[CI/Build] LoRA : make add_lora_test safer (#15181)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
This commit is contained in:
parent 0c6f5023c3
commit 0cfe7d386d
@@ -1,10 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 import asyncio
 import time
-from pathlib import Path
 
 import pytest
-from huggingface_hub import snapshot_download
 
 import vllm.envs as env
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -13,35 +11,9 @@ from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.utils import merge_async_iterators
 
-MODEL_PATH = "meta-llama/Llama-2-7b-hf"
-LORA_MODULE_DOWNLOAD_PATH = None  # Populated by download_and_prepare_lora_module() #noqa
-LORA_RANK = 8
-DEFAULT_MAX_LORAS = 16 * 3
-
-
-def download_and_prepare_lora_module():
-    """
-    Request submission is expensive when the LoRA adapters have their own
-    tokenizers. This is because, for each request with a new LoRA adapter ID,
-    the front-end loads the tokenizer from disk.
-
-    In this test, as we are comparing request processing times, we want to
-    minimize any extra activity. To this effect, we download the LoRA
-    adapter and remove all the tokenizer files, so the engine will default
-    to the base model tokenizer.
-    """
-    global LORA_MODULE_DOWNLOAD_PATH
-
-    LORA_MODULE_HF_PATH = "yard1/llama-2-7b-sql-lora-test"
-    LORA_MODULE_DOWNLOAD_PATH = snapshot_download(repo_id=LORA_MODULE_HF_PATH)
-
-    tokenizer_files = [
-        'added_tokens.json', 'tokenizer_config.json', 'tokenizer.json',
-        'tokenizer.model'
-    ]
-    for tokenizer_file in tokenizer_files:
-        del_path = Path(LORA_MODULE_DOWNLOAD_PATH) / tokenizer_file
-        del_path.unlink(missing_ok=True)
+MODEL_PATH = "THUDM/chatglm3-6b"
+LORA_RANK = 64
+DEFAULT_MAX_LORAS = 4 * 3
 
 
 @pytest.fixture(autouse=True)
@@ -52,11 +24,9 @@ def v1(run_with_both_engines_lora):
     pass
 
 
-def get_lora_requests() -> list[LoRARequest]:
+def get_lora_requests(lora_path) -> list[LoRARequest]:
     lora_requests: list[LoRARequest] = [
-        LoRARequest(lora_name=f"{i}",
-                    lora_int_id=i,
-                    lora_path=LORA_MODULE_DOWNLOAD_PATH)
+        LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
         for i in range(1, DEFAULT_MAX_LORAS + 1)
     ]
     return lora_requests
@@ -93,7 +63,7 @@ async def requests_processing_time(llm,
 
 
 @pytest.mark.asyncio
-async def test_add_lora():
+async def test_add_lora(chatglm3_lora_files):
     """
     The add_lora function is used to pre-load some LoRA adapters into the
     engine in anticipation of future requests using these adapters. To test
@@ -103,10 +73,7 @@ async def test_add_lora():
     We measure the request processing time in both cases and expect the time
     to be lesser in the case with add_lora() calls.
     """
-
-    download_and_prepare_lora_module()
-
-    lora_requests: list[LoRARequest] = get_lora_requests()
+    lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)
 
     max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
     # Create engine in eager-mode. Due to high max_loras, the CI can
@@ -118,6 +85,7 @@ async def test_add_lora():
         max_lora_rank=LORA_RANK,
         max_model_len=128,
         gpu_memory_utilization=0.8,  #avoid OOM
+        trust_remote_code=True,
         enforce_eager=True)
 
     # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
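
For context, the pattern the test exercises looks roughly like the sketch below: an adapter registered through add_lora() ahead of time should not pay the load-from-disk cost when the first request referencing it arrives. This is a minimal sketch, not part of the commit; the engine-arg values mirror the test's configuration, while the prompt, request id, max_loras value, and lora path are illustrative.

# Minimal sketch (not from this commit): pre-loading a LoRA adapter with
# add_lora() before issuing a request that references it.
import asyncio

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams


async def warmed_generate(lora_path: str) -> None:
    engine_args = AsyncEngineArgs(model="THUDM/chatglm3-6b",
                                  enable_lora=True,
                                  max_loras=2,  # illustrative value
                                  max_lora_rank=64,
                                  max_model_len=128,
                                  trust_remote_code=True,
                                  enforce_eager=True)
    llm = AsyncLLMEngine.from_engine_args(engine_args)

    lora = LoRARequest(lora_name="1", lora_int_id=1, lora_path=lora_path)

    # Pre-load the adapter so the request below finds it already resident.
    await llm.add_lora(lora)

    params = SamplingParams(temperature=0.0, max_tokens=16)
    async for output in llm.generate("Hello",
                                     params,
                                     request_id="warmed-1",
                                     lora_request=lora):
        pass  # drain the stream; the final item holds the completed output


if __name__ == "__main__":
    asyncio.run(warmed_generate("/path/to/lora"))  # placeholder path

The test applies this pattern in bulk: it times one batch of requests whose adapters were warmed via add_lora() against a cold batch, and asserts the warmed batch is faster.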
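The new test_add_lora(chatglm3_lora_files) signature relies on a pytest fixture defined elsewhere in the suite (which is why this file no longer needs snapshot_download itself). A minimal stand-in could look like the following; the fixture name matches the diff, but the session scoping and the repo id are assumptions, not taken from the commit.

# Hypothetical stand-in for the chatglm3_lora_files fixture; the real one
# lives in the suite's conftest.py, and this repo_id is a placeholder.
import pytest
from huggingface_hub import snapshot_download


@pytest.fixture(scope="session")
def chatglm3_lora_files() -> str:
    # Download the adapter once per session and hand its local path to the
    # tests, which forward it to get_lora_requests() as lora_path.
    return snapshot_download(repo_id="some-org/chatglm3-6b-lora")  # placeholder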