[aDAG] Unflake aDAG + PP tests (#7600)

This commit is contained in:
SangBin Cho 2024-08-16 20:49:30 -07:00 committed by GitHub
parent bae888cb8e
commit 4706eb628e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 9 additions and 4 deletions

View File

@ -314,11 +314,11 @@ steps:
num_gpus: 4
source_file_dependencies:
- vllm/
- tests/distributed/test_pipeline_parallel
- tests/distributed/test_pp_cudagraph.py
- tests/distributed/test_pipeline_parallel
commands:
- pytest -v -s distributed/test_pipeline_parallel.py
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
- label: LoRA Long Context (Distributed) # 11min
# This test runs llama 13B, so it is required to run on 4 GPUs.

View File

@ -80,6 +80,10 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
"VLLM_USE_RAY_SPMD_WORKER": "1",
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
}
# Temporary workaround. Currently, when zeromq + SPMD is used, it does not
# terminate properly because of an aDAG issue, so frontend multiprocessing
# is disabled for both settings.
pp_args.append("--disable-frontend-multiprocessing")
tp_args.append("--disable-frontend-multiprocessing")
try:
compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)

View File

@ -384,6 +384,7 @@ def fork_new_process_for_each_test(
os.setpgrp()
from _pytest.outcomes import Skipped
pid = os.fork()
print(f"Fork a new process to run a test {pid}")
if pid == 0:
try:
f(*args, **kwargs)
@ -401,11 +402,11 @@ def fork_new_process_for_each_test(
pgid = os.getpgid(pid)
_pid, _exitcode = os.waitpid(pid, 0)
# ignore SIGTERM signal itself
old_singla_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
# kill all child processes
os.killpg(pgid, signal.SIGTERM)
# restore the signal handler
signal.signal(signal.SIGTERM, old_singla_handler)
signal.signal(signal.SIGTERM, old_signal_handler)
assert _exitcode == 0, (f"function {f} failed when called with"
f" args {args} and kwargs {kwargs}")