From 4203926f10f5170c7ef0dfefb676f3b82a7f5e0a Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Wed, 2 Apr 2025 16:39:09 +0800
Subject: [PATCH] [CI/Build] Further clean up LoRA tests (#15920)

Signed-off-by: Jee Jee Li
---
 .buildkite/test-pipeline.yaml        |  4 +---
 tests/lora/conftest.py               | 23 -----------------------
 tests/lora/test_layers.py            |  2 +-
 tests/lora/test_llama_tp.py          | 17 -----------------
 tests/lora/test_minicpmv_tp.py       |  1 -
 tests/lora/test_transfomers_model.py |  8 +++++++-
 6 files changed, 9 insertions(+), 46 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 99358d55..e2b452d8 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -289,7 +289,7 @@ steps:
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
   parallelism: 4

 - label: PyTorch Fullgraph Smoke Test # 9min
@@ -602,8 +602,6 @@ steps:
     # requires multi-GPU testing for validation.
     - pytest -v -s -x lora/test_chatglm3_tp.py
     - pytest -v -s -x lora/test_llama_tp.py
-    - pytest -v -s -x lora/test_minicpmv_tp.py
-    - pytest -v -s -x lora/test_transfomers_model.py


 - label: Weight Loading Multiple GPU Test # 33min
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 523bebe0..91733fde 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -2,7 +2,6 @@

 import tempfile
 from collections import OrderedDict
-from typing import TypedDict
 from unittest.mock import MagicMock, patch

 import pytest
@@ -26,28 +25,6 @@ from vllm.model_executor.models.interfaces import SupportsLoRA
 from vllm.platforms import current_platform


-class ContextIDInfo(TypedDict):
-    lora_id: int
-    context_length: str
-
-
-class ContextInfo(TypedDict):
-    lora: str
-    context_length: str
-
-
-LONG_LORA_INFOS: list[ContextIDInfo] = [{
-    "lora_id": 1,
-    "context_length": "16k",
-}, {
-    "lora_id": 2,
-    "context_length": "16k",
-}, {
-    "lora_id": 3,
-    "context_length": "32k",
-}]
-
-
 @pytest.fixture()
 def should_do_global_cleanup_after_test(request) -> bool:
     """Allow subdirectories to skip global cleanup by overriding this fixture.
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 99d60b33..f85725fe 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -59,7 +59,7 @@ DEVICES = ([
 # prefill stage(True) or decode stage(False)
 STAGES = [True, False]

-NUM_RANDOM_SEEDS = 10
+NUM_RANDOM_SEEDS = 6

 VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128

diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 9f20e47c..31abac87 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -153,20 +153,3 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
         enable_chunked_prefill=True,
     )
     generate_and_test(llm, sql_lora_files)
-
-
-@multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
-def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
-
-    llm = vllm.LLM(
-        MODEL_PATH,
-        enable_lora=True,
-        max_num_seqs=16,
-        max_loras=4,
-        tensor_parallel_size=4,
-        fully_sharded_loras=True,
-        enable_lora_bias=True,
-        enable_chunked_prefill=True,
-    )
-    generate_and_test(llm, sql_lora_files)
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index 00e6fe7c..0b223e50 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -58,7 +58,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
 @pytest.mark.xfail(
     current_platform.is_rocm(),
     reason="MiniCPM-V dependency xformers incompatible with ROCm")
-@create_new_process_for_each_test()
 def test_minicpmv_lora(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transfomers_model.py
index 87db0b4b..b50e210e 100644
--- a/tests/lora/test_transfomers_model.py
+++ b/tests/lora/test_transfomers_model.py
@@ -1,7 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0

+import pytest
+
 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform

 from ..utils import create_new_process_for_each_test, multi_gpu_test

@@ -44,7 +47,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts


-@create_new_process_for_each_test()
 def test_ilama_lora(ilama_lora_files):
     llm = vllm.LLM(MODEL_PATH,
                    max_model_len=1024,
@@ -63,6 +65,8 @@
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]


+@pytest.mark.skipif(current_platform.is_cuda_alike(),
+                    reason="Skipping to avoid redundant model tests")
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_ilama_lora_tp4(ilama_lora_files):
@@ -84,6 +88,8 @@
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]


+@pytest.mark.skipif(current_platform.is_cuda_alike(),
+                    reason="Skipping to avoid redundant model tests")
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
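
Note on the gating pattern introduced in tests/lora/test_transfomers_model.py above: the two 4-GPU ilama tests keep their @multi_gpu_test and @create_new_process_for_each_test decorators but gain a pytest.mark.skipif(current_platform.is_cuda_alike(), ...) guard, so they are still collected on every platform yet skipped on CUDA/ROCm runners, where the patch treats them as redundant model tests. The snippet below is a minimal, self-contained sketch of that pattern, not vLLM code: _FakePlatform and the no-op multi_gpu_test stand-in are assumptions made so it runs under plain pytest, while the real helpers come from vllm.platforms and vLLM's tests/utils.py.

# Sketch only: the stand-ins below replace vLLM-specific test helpers so this
# file runs under plain pytest.
import pytest


class _FakePlatform:
    """Stand-in for vllm.platforms.current_platform (an assumption for this sketch)."""

    def is_cuda_alike(self) -> bool:
        # Pretend we are on a CUDA/ROCm machine; flip to False to see the test run.
        return True


current_platform = _FakePlatform()


def multi_gpu_test(num_gpus: int):
    """No-op stand-in for vLLM's decorator, which skips when fewer GPUs are available."""

    def decorator(fn):
        return fn

    return decorator


# The skipif mark is evaluated at collection time, so on CUDA-alike platforms the
# test is skipped before any model would be loaded; on other platforms it runs.
@pytest.mark.skipif(current_platform.is_cuda_alike(),
                    reason="Skipping to avoid redundant model tests")
@multi_gpu_test(num_gpus=4)
def test_lora_tp4_gating_sketch():
    assert True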