[CI/Build] LoRA: make add_lora_test safer (#15181)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
parent 0c6f5023c3
commit 0cfe7d386d
@@ -1,10 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 import asyncio
 import time
-from pathlib import Path

 import pytest
-from huggingface_hub import snapshot_download

 import vllm.envs as env
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -13,35 +11,9 @@ from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.utils import merge_async_iterators

-MODEL_PATH = "meta-llama/Llama-2-7b-hf"
-LORA_MODULE_DOWNLOAD_PATH = None  # Populated by download_and_prepare_lora_module() #noqa
-LORA_RANK = 8
-DEFAULT_MAX_LORAS = 16 * 3
-
-
-def download_and_prepare_lora_module():
-    """
-    Request submission is expensive when the LoRA adapters have their own
-    tokenizers. This is because, for each request with a new LoRA adapter ID,
-    the front-end loads the tokenizer from disk.
-
-    In this test, as we are comparing request processing times, we want to
-    minimize any extra activity. To this effect, we download the LoRA
-    adapter and remove all the tokenizer files, so the engine will default
-    to the base model tokenizer.
-    """
-    global LORA_MODULE_DOWNLOAD_PATH
-
-    LORA_MODULE_HF_PATH = "yard1/llama-2-7b-sql-lora-test"
-    LORA_MODULE_DOWNLOAD_PATH = snapshot_download(repo_id=LORA_MODULE_HF_PATH)
-
-    tokenizer_files = [
-        'added_tokens.json', 'tokenizer_config.json', 'tokenizer.json',
-        'tokenizer.model'
-    ]
-    for tokenizer_file in tokenizer_files:
-        del_path = Path(LORA_MODULE_DOWNLOAD_PATH) / tokenizer_file
-        del_path.unlink(missing_ok=True)
+MODEL_PATH = "THUDM/chatglm3-6b"
+LORA_RANK = 64
+DEFAULT_MAX_LORAS = 4 * 3


 @pytest.fixture(autouse=True)
@@ -52,11 +24,9 @@ def v1(run_with_both_engines_lora):
     pass


-def get_lora_requests() -> list[LoRARequest]:
+def get_lora_requests(lora_path) -> list[LoRARequest]:
     lora_requests: list[LoRARequest] = [
-        LoRARequest(lora_name=f"{i}",
-                    lora_int_id=i,
-                    lora_path=LORA_MODULE_DOWNLOAD_PATH)
+        LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
         for i in range(1, DEFAULT_MAX_LORAS + 1)
     ]
     return lora_requests
@@ -93,7 +63,7 @@ async def requests_processing_time(llm,


 @pytest.mark.asyncio
-async def test_add_lora():
+async def test_add_lora(chatglm3_lora_files):
     """
     The add_lora function is used to pre-load some LoRA adapters into the
     engine in anticipation of future requests using these adapters. To test
@@ -103,10 +73,7 @@ async def test_add_lora():
     We measure the request processing time in both cases and expect the time
     to be lesser in the case with add_lora() calls.
     """
-
-    download_and_prepare_lora_module()
-
-    lora_requests: list[LoRARequest] = get_lora_requests()
+    lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)

     max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
     # Create engine in eager-mode. Due to high max_loras, the CI can
@@ -118,6 +85,7 @@
                                   max_lora_rank=LORA_RANK,
                                   max_model_len=128,
                                   gpu_memory_utilization=0.8,  #avoid OOM
+                                  trust_remote_code=True,
                                   enforce_eager=True)

     # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
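For context, the pattern this test exercises looks roughly like the sketch below: every adapter is registered with add_lora() before any request is submitted, so the adapter load happens off the request path. MODEL_PATH, LORA_RANK, and DEFAULT_MAX_LORAS mirror the constants in the diff; the preload_adapters() helper, the engine-argument subset, and the assumption that AsyncLLMEngine.add_lora() is awaitable are illustrative, not the test's exact code.

# Sketch only: pre-loading LoRA adapters so later requests find them resident.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.lora.request import LoRARequest

MODEL_PATH = "THUDM/chatglm3-6b"
LORA_RANK = 64
DEFAULT_MAX_LORAS = 4 * 3


async def preload_adapters(lora_path: str) -> AsyncLLMEngine:
    # Hypothetical helper; the real test builds its engine with the
    # AsyncEngineArgs shown in the diff above (eager mode, low max_model_len).
    engine_args = AsyncEngineArgs(model=MODEL_PATH,
                                  enable_lora=True,
                                  max_loras=DEFAULT_MAX_LORAS,
                                  max_lora_rank=LORA_RANK,
                                  max_model_len=128,
                                  trust_remote_code=True,
                                  enforce_eager=True)
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    # add_lora() registers each adapter ahead of time; a later generate()
    # call that references the same lora_int_id skips the load-from-disk.
    for i in range(1, DEFAULT_MAX_LORAS + 1):
        await engine.add_lora(
            LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path))
    return engine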
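The timing side of the comparison is handled by the test's requests_processing_time() helper, whose body falls outside the hunks shown. A stand-in illustrating what such a helper needs to do: fan out one generate() call per LoRA request, drain the merged streams, and report elapsed wall time. The helper name time_requests, the prompt, the request ids, and the sampling settings are placeholders.

# Sketch only: time how long it takes to process one request per adapter.
import time

from vllm.sampling_params import SamplingParams
from vllm.utils import merge_async_iterators


async def time_requests(llm, lora_requests) -> float:
    sampling_params = SamplingParams(max_tokens=16)
    start = time.perf_counter()

    # One streaming generator per request, each tagged with its adapter.
    generators = [
        llm.generate("Hello, my name is",
                     sampling_params,
                     request_id=f"req-{lr.lora_int_id}",
                     lora_request=lr) for lr in lora_requests
    ]
    # merge_async_iterators interleaves the streams, yielding
    # (iterator_index, output) pairs as results arrive from any request.
    async for _, _ in merge_async_iterators(*generators):
        pass

    return time.perf_counter() - start

With adapters pre-loaded via add_lora(), each request's tokenizer and weight loading is already done, so the measured time should be lower than the cold-start run the test compares against.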