[aDAG] Unflake aDAG + PP tests (#7600)
This commit is contained in:
parent
bae888cb8e
commit
4706eb628e
@ -314,11 +314,11 @@ steps:
|
|||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/distributed/test_pipeline_parallel
|
|
||||||
- tests/distributed/test_pp_cudagraph.py
|
- tests/distributed/test_pp_cudagraph.py
|
||||||
|
- tests/distributed/test_pipeline_parallel
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
|
||||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||||
|
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
|
||||||
- label: LoRA Long Context (Distributed) # 11min
|
- label: LoRA Long Context (Distributed) # 11min
|
||||||
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
||||||
|
@ -80,6 +80,10 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
|
|||||||
"VLLM_USE_RAY_SPMD_WORKER": "1",
|
"VLLM_USE_RAY_SPMD_WORKER": "1",
|
||||||
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
|
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
|
||||||
}
|
}
|
||||||
|
# Temporary. Currently when zeromq + SPMD is used, it does not properly
|
||||||
|
# terminate because of aDAG issue.
|
||||||
|
pp_args.append("--disable-frontend-multiprocessing")
|
||||||
|
tp_args.append("--disable-frontend-multiprocessing")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
|
compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
|
||||||
|
@ -384,6 +384,7 @@ def fork_new_process_for_each_test(
|
|||||||
os.setpgrp()
|
os.setpgrp()
|
||||||
from _pytest.outcomes import Skipped
|
from _pytest.outcomes import Skipped
|
||||||
pid = os.fork()
|
pid = os.fork()
|
||||||
|
print(f"Fork a new process to run a test {pid}")
|
||||||
if pid == 0:
|
if pid == 0:
|
||||||
try:
|
try:
|
||||||
f(*args, **kwargs)
|
f(*args, **kwargs)
|
||||||
@ -401,11 +402,11 @@ def fork_new_process_for_each_test(
|
|||||||
pgid = os.getpgid(pid)
|
pgid = os.getpgid(pid)
|
||||||
_pid, _exitcode = os.waitpid(pid, 0)
|
_pid, _exitcode = os.waitpid(pid, 0)
|
||||||
# ignore SIGTERM signal itself
|
# ignore SIGTERM signal itself
|
||||||
old_singla_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
||||||
# kill all child processes
|
# kill all child processes
|
||||||
os.killpg(pgid, signal.SIGTERM)
|
os.killpg(pgid, signal.SIGTERM)
|
||||||
# restore the signal handler
|
# restore the signal handler
|
||||||
signal.signal(signal.SIGTERM, old_singla_handler)
|
signal.signal(signal.SIGTERM, old_signal_handler)
|
||||||
assert _exitcode == 0, (f"function {f} failed when called with"
|
assert _exitcode == 0, (f"function {f} failed when called with"
|
||||||
f" args {args} and kwargs {kwargs}")
|
f" args {args} and kwargs {kwargs}")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user