[Bugfix] Further clean up LoRA test (#14422)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
parent 70da0c0748
commit 12c29a881f
@@ -185,11 +185,6 @@ def mixtral_lora_files():
     return snapshot_download(repo_id="SangBinCho/mixtral-lora")
 
 
-@pytest.fixture(scope="session")
-def mixtral_lora_files_all_target_modules():
-    return snapshot_download(repo_id="dyang415/mixtral-lora-v0")
-
-
 @pytest.fixture(scope="session")
 def gemma_lora_files():
     return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
@@ -69,45 +69,3 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
                      prompts=prompts) == expected_lora_output
     assert do_sample(llm, mixtral_lora_files, lora_id=2,
                      prompts=prompts) == expected_lora_output
-
-
-@pytest.mark.parametrize("tp_size", [4])
-@pytest.mark.parametrize("fully_shard", [True, False])
-def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
-                                         tp_size, fully_shard):
-    """This LoRA model has all supported Mixtral target modules"""
-
-    if torch.cuda.device_count() < tp_size:
-        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
-
-    prompts = [
-        "Question: What happens if you touch the eyes of a blind man?\nChoices:\nB: The man's sight will be restored.\nA: Nothing happens if you touch the eyes of a blind man.\nC: The man will get his eyesight back.\nD: The man will no longer be blind.\nAnswer:",  # noqa: E501
-        "Question: Which will increase the energy of motion of water molecules?\nChoices:\nA: add heat\nB: add ice\nC: remove some of the water\nD: place the water in the freezer\nAnswer:",  # noqa: E501
-        "Since Craig threw aluminum cans in the trash and Benjamin recycled, _ was environmentally irresponsible.\nChoices:\n1: Craig\n2: Benjamin\nAnswer:",  # noqa: E501
-    ]
-
-    llm = vllm.LLM(
-        MODEL_PATH,
-        enable_lora=True,
-        max_num_seqs=16,
-        max_loras=4,
-        distributed_executor_backend="ray",
-        tensor_parallel_size=tp_size,
-        fully_sharded_loras=fully_shard,
-        max_lora_rank=32,
-    )
-
-    expected_lora_output = [
-        "A: Nothing happens if you touch the eyes of a blind man.",
-        "A: add heat",
-        "1: Craig",
-    ]
-
-    assert do_sample(llm,
-                     mixtral_lora_files_all_target_modules,
-                     lora_id=1,
-                     prompts=prompts) == expected_lora_output
-    assert do_sample(llm,
-                     mixtral_lora_files_all_target_modules,
-                     lora_id=2,
-                     prompts=prompts) == expected_lora_output
@@ -178,7 +178,8 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
                                  model):
     if num_gpus_available < 2:
         pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
-
+    if model.quantization == "GPTQ":
+        pytest.skip("GPTQ lora outputs are just incredibly unstable")
     llm_tp1 = vllm.LLM(
         model=model.model_path,
         enable_lora=True,