From 86c3369eb888fc57c8edcbbfd0bc6f22a606f79c Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Wed, 9 Apr 2025 09:13:56 +0800
Subject: [PATCH] [CI/Build] Fix CI LoRA failure (#16270)

Signed-off-by: Jee Jee Li
---
 tests/lora/conftest.py               | 12 ++++++++++++
 tests/lora/test_baichuan.py          |  1 -
 tests/lora/test_chatglm3_tp.py       |  1 -
 tests/lora/test_layers.py            |  2 +-
 tests/lora/test_llama_tp.py          |  1 -
 tests/lora/test_punica_ops.py        |  5 +++++
 tests/lora/test_quant_model.py       |  9 +--------
 tests/lora/test_transfomers_model.py |  1 -
 8 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 91733fde..dc433f9d 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -256,3 +256,15 @@ def run_with_both_engines_lora(request, monkeypatch):
         monkeypatch.setenv('VLLM_USE_V1', '0')
 
     yield
+
+
+@pytest.fixture
+def reset_default_device():
+    """
+    Some tests, such as `test_punica_ops.py`, explicitly set the
+    default device, which can affect subsequent tests. Adding this fixture
+    helps avoid this problem.
+    """
+    original_device = torch.get_default_device()
+    yield
+    torch.set_default_device(original_device)
diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py
index 4dacbe26..007be7aa 100644
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -73,7 +73,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
                        max_num_seqs=16,
                        max_loras=4,
                        max_lora_rank=64,
-                       tensor_parallel_size=1,
                        trust_remote_code=True,
                        fully_sharded_loras=fully_sharded)
     output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py
index 28a6f163..2c18a115 100644
--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -61,7 +61,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
                    enable_lora=True,
                    max_loras=4,
                    max_lora_rank=64,
-                   tensor_parallel_size=1,
                    trust_remote_code=True,
                    enable_chunked_prefill=True)
 
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index f85725fe..0a8b38fa 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -65,7 +65,7 @@ VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
 
 
 @pytest.fixture(autouse=True)
-def clean_cache():
+def clean_cache_reset_device(reset_default_device):
     # Release any memory we might be holding on to. CI runs OOMs otherwise.
     from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                 _LORA_B_PTR_DICT)
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 31abac87..cdb8c893 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -88,7 +88,6 @@ def test_llama_lora(sql_lora_files):
                    # also test odd max_num_seqs
                    max_num_seqs=13,
                    max_loras=4,
-                   tensor_parallel_size=1,
                    enable_chunked_prefill=True)
     generate_and_test(llm, sql_lora_files)
 
diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py
index 726d0c5f..add313c9 100644
--- a/tests/lora/test_punica_ops.py
+++ b/tests/lora/test_punica_ops.py
@@ -13,6 +13,11 @@ from vllm.platforms import current_platform
 from .utils import PunicaTensors, assert_close, generate_data_for_nslices
 
+@pytest.fixture(autouse=True)
+def reset_device(reset_default_device):
+    pass
+
+
 # Utility shrink and expand operations used as reference implementations.
 def sgmv_shrink_for_nslices(
         nslices: int,
         inputs_tensor: torch.Tensor,
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index a4a47a9c..caf71976 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -78,12 +78,7 @@ def do_sample(llm: vllm.LLM,
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("tp_size", [1])
-def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
-                          tp_size):
-    if num_gpus_available < tp_size and \
-            tp_size > 1 and current_platform.is_cuda_alike():
-        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+def test_quant_model_lora(tinyllama_lora_files, model):
 
     llm = vllm.LLM(
         model=model.model_path,
@@ -91,7 +86,6 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
         max_num_seqs=16,
         max_loras=4,
         max_model_len=400,
-        tensor_parallel_size=tp_size,
         gpu_memory_utilization=0.2,  #avoid OOM
         quantization=model.quantization,
         trust_remote_code=True,
@@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
         enable_lora=True,
         max_num_seqs=16,
         max_loras=4,
-        tensor_parallel_size=1,
         gpu_memory_utilization=0.2,  #avoid OOM
         quantization=model.quantization,
         trust_remote_code=True,
diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transfomers_model.py
index b50e210e..63907f2c 100644
--- a/tests/lora/test_transfomers_model.py
+++ b/tests/lora/test_transfomers_model.py
@@ -53,7 +53,6 @@ def test_ilama_lora(ilama_lora_files):
                    enable_lora=True,
                    max_loras=4,
                    max_lora_rank=16,
-                   tensor_parallel_size=1,
                    trust_remote_code=True,
                    enable_chunked_prefill=True)
 
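
A note on the fixture pattern this patch relies on: in a pytest yield fixture,
code before `yield` runs as setup and code after `yield` runs as teardown, so
`reset_default_device` snapshots the process-wide default device and restores
it after each test that requests it, even if the test body changed it or
failed. The standalone sketch below restates that pattern outside the diff;
the test function is illustrative only and is not part of the patch.

import pytest
import torch


@pytest.fixture
def reset_default_device():
    # Setup: snapshot the process-wide default device.
    original_device = torch.get_default_device()
    yield
    # Teardown: restore it so later tests see a clean state.
    torch.set_default_device(original_device)


def test_device_leak_is_contained(reset_default_device):
    # Hypothetical test: mutate the global default device freely;
    # the fixture's teardown puts it back afterwards.
    torch.set_default_device("cpu")
    assert torch.get_default_device().type == "cpu"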