[CI/Build] LoRA: make add_lora_test safer (#15181)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Varun Sundar Rabindranath 2025-03-20 21:28:53 -04:00 committed by GitHub
parent 0c6f5023c3
commit 0cfe7d386d


@@ -1,10 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 import asyncio
 import time
-from pathlib import Path
 import pytest
-from huggingface_hub import snapshot_download
 import vllm.envs as env
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -13,35 +11,9 @@ from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.utils import merge_async_iterators
 
-MODEL_PATH = "meta-llama/Llama-2-7b-hf"
-LORA_MODULE_DOWNLOAD_PATH = None  # Populated by download_and_prepare_lora_module() #noqa
-LORA_RANK = 8
-DEFAULT_MAX_LORAS = 16 * 3
-
-
-def download_and_prepare_lora_module():
-    """
-    Request submission is expensive when the LoRA adapters have their own
-    tokenizers. This is because, for each request with a new LoRA adapter ID,
-    the front-end loads the tokenizer from disk.
-
-    In this test, as we are comparing request processing times, we want to
-    minimize any extra activity. To this effect, we download the LoRA
-    adapter and remove all the tokenizer files, so the engine will default
-    to the base model tokenizer.
-    """
-    global LORA_MODULE_DOWNLOAD_PATH
-
-    LORA_MODULE_HF_PATH = "yard1/llama-2-7b-sql-lora-test"
-    LORA_MODULE_DOWNLOAD_PATH = snapshot_download(repo_id=LORA_MODULE_HF_PATH)
-
-    tokenizer_files = [
-        'added_tokens.json', 'tokenizer_config.json', 'tokenizer.json',
-        'tokenizer.model'
-    ]
-    for tokenizer_file in tokenizer_files:
-        del_path = Path(LORA_MODULE_DOWNLOAD_PATH) / tokenizer_file
-        del_path.unlink(missing_ok=True)
+MODEL_PATH = "THUDM/chatglm3-6b"
+LORA_RANK = 64
+DEFAULT_MAX_LORAS = 4 * 3
 
 
 @pytest.fixture(autouse=True)
@@ -52,11 +24,9 @@ def v1(run_with_both_engines_lora):
     pass
 
 
-def get_lora_requests() -> list[LoRARequest]:
+def get_lora_requests(lora_path) -> list[LoRARequest]:
     lora_requests: list[LoRARequest] = [
-        LoRARequest(lora_name=f"{i}",
-                    lora_int_id=i,
-                    lora_path=LORA_MODULE_DOWNLOAD_PATH)
+        LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
        for i in range(1, DEFAULT_MAX_LORAS + 1)
     ]
     return lora_requests
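
With the ad-hoc snapshot_download helper gone, the adapter path is injected via the pytest fixture `chatglm3_lora_files` (used in the test below). For context, a minimal sketch of what such a fixture could look like in tests/lora/conftest.py; the fixture name matches the diff, but the repo id and session scope are assumptions for illustration:

import pytest
from huggingface_hub import snapshot_download

@pytest.fixture(scope="session")
def chatglm3_lora_files():
    # Download a ChatGLM3 LoRA adapter once per test session and return
    # the local directory path; the repo id below is illustrative only.
    return snapshot_download(repo_id="jeeejeee/chatglm3-6b-text2sql")

Centralizing the download in a shared fixture also avoids the tokenizer-file surgery the old helper performed, since the adapter is used as published.
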
@@ -93,7 +63,7 @@ async def requests_processing_time(llm,
 
 
 @pytest.mark.asyncio
-async def test_add_lora():
+async def test_add_lora(chatglm3_lora_files):
     """
     The add_lora function is used to pre-load some LoRA adapters into the
     engine in anticipation of future requests using these adapters. To test
@@ -103,10 +73,7 @@ async def test_add_lora():
     We measure the request processing time in both cases and expect the time
     to be lesser in the case with add_lora() calls.
     """
-    download_and_prepare_lora_module()
-
-    lora_requests: list[LoRARequest] = get_lora_requests()
+    lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)
 
     max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
 
     # Create engine in eager-mode. Due to high max_loras, the CI can
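
The comparison the docstring describes boils down to timing the same batch of requests with and without pre-registering the adapters. A minimal sketch of that pattern, assuming an async engine handle `llm` that exposes an awaitable `add_lora()` and the `requests_processing_time` helper defined earlier in this file returns elapsed seconds:

async def compare(llm, lora_requests) -> None:
    # Cold pass: each adapter is loaded lazily on its first request.
    time_cold = await requests_processing_time(llm, lora_requests)

    # Warm pass: pre-register every adapter via add_lora() first, so
    # requests never pay the load-from-disk cost on the hot path.
    for lr in lora_requests:
        await llm.add_lora(lr)
    time_warm = await requests_processing_time(llm, lora_requests)

    assert time_warm < time_cold
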
@@ -118,6 +85,7 @@ async def test_add_lora():
                               max_lora_rank=LORA_RANK,
                               max_model_len=128,
                               gpu_memory_utilization=0.8,  #avoid OOM
+                              trust_remote_code=True,
                               enforce_eager=True)
 
     # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
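
For completeness, the engine construction this hunk patches presumably looks roughly like the sketch below once the new flag lands; the visible kwargs are taken from the diff context, while `model=MODEL_PATH`, `enable_lora=True`, and `max_loras=max_loras` are assumptions inferred from the rest of the file. `trust_remote_code=True` becomes necessary because THUDM/chatglm3-6b ships custom modeling code on the Hub:

from vllm.engine.arg_utils import AsyncEngineArgs

engine_args = AsyncEngineArgs(
    model=MODEL_PATH,            # "THUDM/chatglm3-6b" (assumed kwarg)
    enable_lora=True,            # assumed kwarg
    max_loras=max_loras,         # assumed kwarg
    max_lora_rank=LORA_RANK,     # 64
    max_model_len=128,
    gpu_memory_utilization=0.8,  # avoid OOM
    trust_remote_code=True,      # chatglm3 requires custom modeling code
    enforce_eager=True,          # skip CUDA graph capture to save CI memory
)
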