[CI/Build] LoRA : make add_lora_test safer (#15181)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
This commit is contained in:
parent 0c6f5023c3
commit 0cfe7d386d
@@ -1,10 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 import asyncio
 import time
-from pathlib import Path
 
 import pytest
-from huggingface_hub import snapshot_download
 
 import vllm.envs as env
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -13,35 +11,9 @@ from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.utils import merge_async_iterators
 
-MODEL_PATH = "meta-llama/Llama-2-7b-hf"
-LORA_MODULE_DOWNLOAD_PATH = None  # Populated by download_and_prepare_lora_module() #noqa
-LORA_RANK = 8
-DEFAULT_MAX_LORAS = 16 * 3
-
-
-def download_and_prepare_lora_module():
-    """
-    Request submission is expensive when the LoRA adapters have their own
-    tokenizers. This is because, for each request with a new LoRA adapter ID,
-    the front-end loads the tokenizer from disk.
-
-    In this test, as we are comparing request processing times, we want to
-    minimize any extra activity. To this effect, we download the LoRA
-    adapter and remove all the tokenizer files, so the engine will default
-    to the base model tokenizer.
-    """
-    global LORA_MODULE_DOWNLOAD_PATH
-
-    LORA_MODULE_HF_PATH = "yard1/llama-2-7b-sql-lora-test"
-    LORA_MODULE_DOWNLOAD_PATH = snapshot_download(repo_id=LORA_MODULE_HF_PATH)
-
-    tokenizer_files = [
-        'added_tokens.json', 'tokenizer_config.json', 'tokenizer.json',
-        'tokenizer.model'
-    ]
-    for tokenizer_file in tokenizer_files:
-        del_path = Path(LORA_MODULE_DOWNLOAD_PATH) / tokenizer_file
-        del_path.unlink(missing_ok=True)
+MODEL_PATH = "THUDM/chatglm3-6b"
+LORA_RANK = 64
+DEFAULT_MAX_LORAS = 4 * 3
 
 
 @pytest.fixture(autouse=True)
@@ -52,11 +24,9 @@ def v1(run_with_both_engines_lora):
     pass
 
 
-def get_lora_requests() -> list[LoRARequest]:
+def get_lora_requests(lora_path) -> list[LoRARequest]:
     lora_requests: list[LoRARequest] = [
-        LoRARequest(lora_name=f"{i}",
-                    lora_int_id=i,
-                    lora_path=LORA_MODULE_DOWNLOAD_PATH)
+        LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
         for i in range(1, DEFAULT_MAX_LORAS + 1)
     ]
     return lora_requests
@@ -93,7 +63,7 @@ async def requests_processing_time(llm,
 
 
 @pytest.mark.asyncio
-async def test_add_lora():
+async def test_add_lora(chatglm3_lora_files):
     """
     The add_lora function is used to pre-load some LoRA adapters into the
     engine in anticipation of future requests using these adapters. To test
@@ -103,10 +73,7 @@ async def test_add_lora():
     We measure the request processing time in both cases and expect the time
     to be lesser in the case with add_lora() calls.
     """
-
-    download_and_prepare_lora_module()
-
-    lora_requests: list[LoRARequest] = get_lora_requests()
+    lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)
 
     max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
     # Create engine in eager-mode. Due to high max_loras, the CI can
@@ -118,6 +85,7 @@ async def test_add_lora():
         max_lora_rank=LORA_RANK,
         max_model_len=128,
         gpu_memory_utilization=0.8,  #avoid OOM
+        trust_remote_code=True,
         enforce_eager=True)
 
     # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
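
For context, the pattern the test exercises looks roughly like the sketch below: an adapter registered through add_lora() ahead of time should not pay the load-from-disk cost when the first request referencing it arrives. This is a minimal sketch, not part of the commit; the engine-arg values mirror the test's configuration, while the prompt, request id, max_loras value, and lora path are illustrative.

# Minimal sketch (not from this commit): pre-loading a LoRA adapter with
# add_lora() before issuing a request that references it.
import asyncio

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams


async def warmed_generate(lora_path: str) -> None:
    engine_args = AsyncEngineArgs(model="THUDM/chatglm3-6b",
                                  enable_lora=True,
                                  max_loras=2,  # illustrative value
                                  max_lora_rank=64,
                                  max_model_len=128,
                                  trust_remote_code=True,
                                  enforce_eager=True)
    llm = AsyncLLMEngine.from_engine_args(engine_args)

    lora = LoRARequest(lora_name="1", lora_int_id=1, lora_path=lora_path)

    # Pre-load the adapter so the request below finds it already resident.
    await llm.add_lora(lora)

    params = SamplingParams(temperature=0.0, max_tokens=16)
    async for output in llm.generate("Hello",
                                     params,
                                     request_id="warmed-1",
                                     lora_request=lora):
        pass  # drain the stream; the final item holds the completed output


if __name__ == "__main__":
    asyncio.run(warmed_generate("/path/to/lora"))  # placeholder path

The test applies this pattern in bulk: it times one batch of requests whose adapters were warmed via add_lora() against a cold batch, and asserts the warmed batch is faster.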
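The new test_add_lora(chatglm3_lora_files) signature relies on a pytest fixture defined elsewhere in the suite (which is why this file no longer needs snapshot_download itself). A minimal stand-in could look like the following; the fixture name matches the diff, but the session scoping and the repo id are assumptions, not taken from the commit.

# Hypothetical stand-in for the chatglm3_lora_files fixture; the real one
# lives in the suite's conftest.py, and this repo_id is a placeholder.
import pytest
from huggingface_hub import snapshot_download


@pytest.fixture(scope="session")
def chatglm3_lora_files() -> str:
    # Download the adapter once per session and hand its local path to the
    # tests, which forward it to get_lora_requests() as lora_path.
    return snapshot_download(repo_id="some-org/chatglm3-6b-lora")  # placeholder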