[aDAG] Unflake aDAG + PP tests (#7600)
This commit is contained in:
parent
bae888cb8e
commit
4706eb628e
@ -314,11 +314,11 @@ steps:
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/distributed/test_pipeline_parallel
|
||||
- tests/distributed/test_pp_cudagraph.py
|
||||
- tests/distributed/test_pipeline_parallel
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
|
||||
- label: LoRA Long Context (Distributed) # 11min
|
||||
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
||||
|
@ -80,6 +80,10 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
|
||||
"VLLM_USE_RAY_SPMD_WORKER": "1",
|
||||
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
|
||||
}
|
||||
# Temporary. Currently when zeromq + SPMD is used, it does not properly
|
||||
# terminate because of aDAG issue.
|
||||
pp_args.append("--disable-frontend-multiprocessing")
|
||||
tp_args.append("--disable-frontend-multiprocessing")
|
||||
|
||||
try:
|
||||
compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
|
||||
|
@ -384,6 +384,7 @@ def fork_new_process_for_each_test(
|
||||
os.setpgrp()
|
||||
from _pytest.outcomes import Skipped
|
||||
pid = os.fork()
|
||||
print(f"Fork a new process to run a test {pid}")
|
||||
if pid == 0:
|
||||
try:
|
||||
f(*args, **kwargs)
|
||||
@ -401,11 +402,11 @@ def fork_new_process_for_each_test(
|
||||
pgid = os.getpgid(pid)
|
||||
_pid, _exitcode = os.waitpid(pid, 0)
|
||||
# ignore SIGTERM signal itself
|
||||
old_singla_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
||||
old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
||||
# kill all child processes
|
||||
os.killpg(pgid, signal.SIGTERM)
|
||||
# restore the signal handler
|
||||
signal.signal(signal.SIGTERM, old_singla_handler)
|
||||
signal.signal(signal.SIGTERM, old_signal_handler)
|
||||
assert _exitcode == 0, (f"function {f} failed when called with"
|
||||
f" args {args} and kwargs {kwargs}")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user