
# SPDX-License-Identifier: Apache-2.0
import asyncio
import time

import pytest

import vllm.envs as env
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
from vllm.utils import merge_async_iterators

MODEL_PATH = "THUDM/chatglm3-6b"
LORA_RANK = 64
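# 12 adapters in total; test_add_lora splits them evenly across its dummy,
# warmup and cold phases.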
DEFAULT_MAX_LORAS = 4 * 3


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test.
    # This can be promoted up to conftest.py to run for every
    # test in a package.
    pass


def get_lora_requests(lora_path) -> list[LoRARequest]:
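    # Every request points at the same adapter files on disk; the distinct
    # lora_name/lora_int_id values make the engine treat them as separate
    # adapters.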
    lora_requests: list[LoRARequest] = [
        LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
        for i in range(1, DEFAULT_MAX_LORAS + 1)
    ]
    return lora_requests


async def requests_processing_time(llm,
                                   lora_requests: list[LoRARequest]) -> float:

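    # Each request generates exactly one token (ignore_eos=True, max_tokens=1),
    # so the measured time is dominated by request setup and LoRA loading
    # rather than by decoding.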
    sampling_params = SamplingParams(n=1,
                                     temperature=0.0,
                                     top_p=1.0,
                                     ignore_eos=True,
                                     max_tokens=1)

    generators = []
    start = time.perf_counter()

    for lora_request in lora_requests:
        lora_int_id = lora_request.lora_int_id
        generator = llm.generate(
            prompt=TextPrompt(prompt=f"hello {lora_int_id}",
                              multi_modal_data=None),  # type: ignore
            sampling_params=sampling_params,
            lora_request=lora_request,
            request_id=f"test{lora_int_id}")
        generators.append(generator)

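    # Merge the per-request output streams and drain them; only the total
    # wall-clock time matters here, not the generated text.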
    all_gens = merge_async_iterators(*generators)
    async for i, res in all_gens:
        pass

    end = time.perf_counter()
    return end - start


@pytest.mark.asyncio
async def test_add_lora(chatglm3_lora_files):
    """
    The add_lora function is used to pre-load some LoRA adapters into the
    engine in anticipation of future requests using these adapters. To test
    this functionality, we use the async engine to process a batch of
    requests twice: once with add_lora() pre-loading and once without.

    We measure the request processing time in both cases and expect it to be
    lower in the case with add_lora() calls.
    """
    lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)

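    # Size max_loras to the number of distinct adapter IDs so every adapter
    # used in a phase can be active at the same time.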
    max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
    # Create engine in eager-mode. Due to high max_loras, the CI can
    # OOM during cuda-graph capture.
    engine_args = AsyncEngineArgs(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=max_loras,
        max_lora_rank=LORA_RANK,
        max_model_len=128,
        gpu_memory_utilization=0.8,  # avoid OOM
        trust_remote_code=True,
        enforce_eager=True)

    # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
    # environment variable. Reload vllm.engine.async_llm_engine, as
    # vllm.engine.async_llm_engine.AsyncLLMEngine changes depending on the
    # env var.
    import importlib

    import vllm.engine.async_llm_engine
    importlib.reload(vllm.engine.async_llm_engine)
    from vllm.entrypoints.openai.api_server import (
        build_async_engine_client_from_engine_args)

    # Split lora_requests into three disjoint parts. Each phase uses its own
    # adapter IDs, so adapters loaded in one phase do not warm the next.
    part_size = len(lora_requests) // 3
    dummy_run_requests = lora_requests[:part_size]
    warmup_run_requests = lora_requests[part_size:part_size * 2]
    cold_run_requests = lora_requests[part_size * 2:]

    async with build_async_engine_client_from_engine_args(engine_args) as llm:

        # Dummy run - so any one-time work, such as triton kernel compilation,
        # is complete before we start measuring.
        await requests_processing_time(llm, dummy_run_requests)

        # Run with warmup: pre-load the adapters with add_lora() before
        # sending any requests that use them.
        add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests]
        add_lora_results = await asyncio.gather(*add_lora_tasks)
        if env.VLLM_USE_V1:
            # Test that all add_lora calls are successful.
            assert all(add_lora_results)
        else:
            # No way to check V0 engine results as the calls just return None.
            pass
        time_with_add_lora = await requests_processing_time(
            llm, warmup_run_requests)

        # Run without any warmup
        time_cold_start = await requests_processing_time(
            llm, cold_run_requests)

        print(f"time hot-start {time_with_add_lora} vs "
              f"time cold-start {time_cold_start}")

        assert time_with_add_lora < time_cold_start, (
            f"time_with_add_lora={time_with_add_lora}, "
            f"time_cold_start={time_cold_start}. "
            "The engine request processing time with LoRA pre-loading "
            "must be less than the version that does on-demand LoRA loading.")