diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py
index 644a075b..c8b7a5cb 100644
--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@@ -1,10 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 import asyncio
 import time
-from pathlib import Path
 
 import pytest
-from huggingface_hub import snapshot_download
 
 import vllm.envs as env
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -13,35 +11,9 @@ from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.utils import merge_async_iterators
 
-MODEL_PATH = "meta-llama/Llama-2-7b-hf"
-LORA_MODULE_DOWNLOAD_PATH = None  # Populated by download_and_prepare_lora_module() #noqa
-LORA_RANK = 8
-DEFAULT_MAX_LORAS = 16 * 3
-
-
-def download_and_prepare_lora_module():
-    """
-    Request submission is expensive when the LoRA adapters have their own
-    tokenizers. This is because, for each request with a new LoRA adapter ID,
-    the front-end loads the tokenizer from disk.
-
-    In this test, as we are comparing request processing times, we want to
-    minimize any extra activity. To this effect, we download the LoRA
-    adapter and remove all the tokenizer files, so the engine will default
-    to the base model tokenizer.
-    """
-    global LORA_MODULE_DOWNLOAD_PATH
-
-    LORA_MODULE_HF_PATH = "yard1/llama-2-7b-sql-lora-test"
-    LORA_MODULE_DOWNLOAD_PATH = snapshot_download(repo_id=LORA_MODULE_HF_PATH)
-
-    tokenizer_files = [
-        'added_tokens.json', 'tokenizer_config.json', 'tokenizer.json',
-        'tokenizer.model'
-    ]
-    for tokenizer_file in tokenizer_files:
-        del_path = Path(LORA_MODULE_DOWNLOAD_PATH) / tokenizer_file
-        del_path.unlink(missing_ok=True)
+MODEL_PATH = "THUDM/chatglm3-6b"
+LORA_RANK = 64
+DEFAULT_MAX_LORAS = 4 * 3
 
 
 @pytest.fixture(autouse=True)
@@ -52,11 +24,9 @@ def v1(run_with_both_engines_lora):
     pass
 
 
-def get_lora_requests() -> list[LoRARequest]:
+def get_lora_requests(lora_path) -> list[LoRARequest]:
     lora_requests: list[LoRARequest] = [
-        LoRARequest(lora_name=f"{i}",
-                    lora_int_id=i,
-                    lora_path=LORA_MODULE_DOWNLOAD_PATH)
+        LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
         for i in range(1, DEFAULT_MAX_LORAS + 1)
     ]
     return lora_requests
@@ -93,7 +63,7 @@ async def requests_processing_time(llm,
 
 
 @pytest.mark.asyncio
-async def test_add_lora():
+async def test_add_lora(chatglm3_lora_files):
     """
     The add_lora function is used to pre-load some LoRA adapters into the
     engine in anticipation of future requests using these adapters. To test
@@ -103,10 +73,7 @@
     We measure the request processing time in both cases and expect the time
     to be lesser in the case with add_lora() calls.
     """
-
-    download_and_prepare_lora_module()
-
-    lora_requests: list[LoRARequest] = get_lora_requests()
+    lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)
     max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
 
     # Create engine in eager-mode. Due to high max_loras, the CI can
@@ -118,6 +85,7 @@
                                   max_lora_rank=LORA_RANK,
                                   max_model_len=128,
                                   gpu_memory_utilization=0.8,  #avoid OOM
+                                  trust_remote_code=True,
                                   enforce_eager=True)
 
     # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
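Note: the rewritten test no longer downloads and patches the LoRA adapter itself; it receives the adapter path through a `chatglm3_lora_files` pytest fixture, which this diff does not define (it is expected to live in `tests/lora/conftest.py`). Below is a minimal sketch of what such a fixture could look like. The repo id and the session scope are assumptions for illustration, not taken from this diff.

# Hypothetical conftest.py fixture assumed by test_add_lora(chatglm3_lora_files).
# The adapter repo id below is an assumption; substitute the one the suite uses.
import pytest
from huggingface_hub import snapshot_download


@pytest.fixture(scope="session")
def chatglm3_lora_files():
    # Download the ChatGLM3 LoRA adapter once per test session and return
    # the local snapshot path, which the test passes to get_lora_requests().
    return snapshot_download(repo_id="jeeejeee/chatglm3-6b-text2sql-lora")

Keeping the download in a session-scoped fixture means the adapter is fetched once and reused by every LoRARequest the test builds, which is what makes the add_lora() vs. no-add_lora() timing comparison meaningful.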