From 86c3369eb888fc57c8edcbbfd0bc6f22a606f79c Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Wed, 9 Apr 2025 09:13:56 +0800
Subject: [PATCH] [CI/Build] Fix CI LoRA failure (#16270)

Signed-off-by: Jee Jee Li
---
 tests/lora/conftest.py               | 12 ++++++++++++
 tests/lora/test_baichuan.py          |  1 -
 tests/lora/test_chatglm3_tp.py       |  1 -
 tests/lora/test_layers.py            |  2 +-
 tests/lora/test_llama_tp.py          |  1 -
 tests/lora/test_punica_ops.py        |  5 +++++
 tests/lora/test_quant_model.py       |  9 +--------
 tests/lora/test_transfomers_model.py |  1 -
 8 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 91733fde..dc433f9d 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -256,3 +256,15 @@ def run_with_both_engines_lora(request, monkeypatch):
         monkeypatch.setenv('VLLM_USE_V1', '0')
 
     yield
+
+
+@pytest.fixture
+def reset_default_device():
+    """
+    Some tests, such as `test_punica_ops.py`, explicitly set the
+    default device, which can affect subsequent tests. Adding this fixture
+    helps avoid this problem.
+    """
+    original_device = torch.get_default_device()
+    yield
+    torch.set_default_device(original_device)
diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py
index 4dacbe26..007be7aa 100644
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -73,7 +73,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
                        max_num_seqs=16,
                        max_loras=4,
                        max_lora_rank=64,
-                       tensor_parallel_size=1,
                        trust_remote_code=True,
                        fully_sharded_loras=fully_sharded)
     output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py
index 28a6f163..2c18a115 100644
--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -61,7 +61,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
                    enable_lora=True,
                    max_loras=4,
                    max_lora_rank=64,
-                   tensor_parallel_size=1,
                    trust_remote_code=True,
                    enable_chunked_prefill=True)
 
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index f85725fe..0a8b38fa 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -65,7 +65,7 @@ VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
 
 
 @pytest.fixture(autouse=True)
-def clean_cache():
+def clean_cache_reset_device(reset_default_device):
     # Release any memory we might be holding on to. CI runs OOMs otherwise.
     from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                 _LORA_B_PTR_DICT)
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 31abac87..cdb8c893 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -88,7 +88,6 @@ def test_llama_lora(sql_lora_files):
                    # also test odd max_num_seqs
                    max_num_seqs=13,
                    max_loras=4,
-                   tensor_parallel_size=1,
                    enable_chunked_prefill=True)
     generate_and_test(llm, sql_lora_files)
 
diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py
index 726d0c5f..add313c9 100644
--- a/tests/lora/test_punica_ops.py
+++ b/tests/lora/test_punica_ops.py
@@ -13,6 +13,11 @@ from vllm.platforms import current_platform
 from .utils import PunicaTensors, assert_close, generate_data_for_nslices
 
+@pytest.fixture(autouse=True)
+def reset_device(reset_default_device):
+    pass
+
+
 # Utility shrink and expand operations used as reference implementations.
 def sgmv_shrink_for_nslices(
         nslices: int,
         inputs_tensor: torch.Tensor,
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index a4a47a9c..caf71976 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -78,12 +78,7 @@ def do_sample(llm: vllm.LLM,
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("tp_size", [1])
-def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
-                          tp_size):
-    if num_gpus_available < tp_size and \
-            tp_size > 1 and current_platform.is_cuda_alike():
-        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+def test_quant_model_lora(tinyllama_lora_files, model):
 
     llm = vllm.LLM(
         model=model.model_path,
@@ -91,7 +86,6 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
         max_num_seqs=16,
         max_loras=4,
         max_model_len=400,
-        tensor_parallel_size=tp_size,
         gpu_memory_utilization=0.2,  #avoid OOM
         quantization=model.quantization,
         trust_remote_code=True,
@@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
         enable_lora=True,
         max_num_seqs=16,
         max_loras=4,
-        tensor_parallel_size=1,
         gpu_memory_utilization=0.2,  #avoid OOM
         quantization=model.quantization,
         trust_remote_code=True,
diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transfomers_model.py
index b50e210e..63907f2c 100644
--- a/tests/lora/test_transfomers_model.py
+++ b/tests/lora/test_transfomers_model.py
@@ -53,7 +53,6 @@ def test_ilama_lora(ilama_lora_files):
                    enable_lora=True,
                    max_loras=4,
                    max_lora_rank=16,
-                   tensor_parallel_size=1,
                    trust_remote_code=True,
                    enable_chunked_prefill=True)
 
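
A note on the fixture pattern this patch relies on: in a pytest yield fixture,
code before `yield` runs as setup and code after `yield` runs as teardown, so
`reset_default_device` snapshots the process-wide default device and restores
it after each test that requests it, even if the test body changed it or
failed. The standalone sketch below restates that pattern outside the diff;
the test function is illustrative only and is not part of the patch.

import pytest
import torch


@pytest.fixture
def reset_default_device():
    # Setup: snapshot the process-wide default device.
    original_device = torch.get_default_device()
    yield
    # Teardown: restore it so later tests see a clean state.
    torch.set_default_device(original_device)


def test_device_leak_is_contained(reset_default_device):
    # Hypothetical test: mutate the global default device freely;
    # the fixture's teardown puts it back afterwards.
    torch.set_default_device("cpu")
    assert torch.get_default_device().type == "cpu"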