vllm/tests/lora/test_lora_functions.py

# SPDX-License-Identifier: Apache-2.0
"""
Script to test add_lora, remove_lora, pin_lora, list_loras functions.
"""
import os

import pytest

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.lora.request import LoRARequest

MODEL_PATH = "meta-llama/Llama-2-7b-hf"
LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
LORA_RANK = 8


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run each test against both engines.
    # This can be promoted to conftest.py to run for every test in the
    # package.
    pass


def make_lora_request(lora_id: int):
return LoRARequest(lora_name=f"{lora_id}",
lora_int_id=lora_id,
                       lora_path=LORA_MODULE_PATH)


def test_lora_functions_sync():
max_loras = 4
    # Create the engine in eager mode. With a high max_loras, CUDA graph
    # capture can OOM on CI.
engine_args = EngineArgs(model=MODEL_PATH,
enable_lora=True,
max_loras=max_loras,
max_lora_rank=LORA_RANK,
max_model_len=128,
gpu_memory_utilization=0.8,
enforce_eager=True)
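    # The synchronous LLMEngine exposes the LoRA management functions
    # (add_lora, remove_lora, pin_lora, list_loras) directly on the engine
    # instance, as exercised below.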
    llm = LLMEngine.from_engine_args(engine_args)

    def run_check(fn, args, expected: list):
fn(args)
        assert set(llm.list_loras()) == set(expected)

    run_check(llm.add_lora, make_lora_request(1), [1])
run_check(llm.add_lora, make_lora_request(2), [1, 2])
# Pin LoRA 1 and test that it is never removed on subsequent adds.
run_check(llm.pin_lora, 1, [1, 2])
run_check(llm.add_lora, make_lora_request(3), [1, 2, 3])
run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4])
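    # max_loras is 4, so each add beyond the fourth is expected to evict the
    # least recently used unpinned adapter; pinned LoRA 1 stays loaded.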
run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4])
run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4])
run_check(llm.add_lora, make_lora_request(7), [1, 5, 6, 7])
run_check(llm.add_lora, make_lora_request(8), [1, 8, 6, 7])
run_check(llm.add_lora, make_lora_request(9), [1, 8, 9, 7])
run_check(llm.add_lora, make_lora_request(10), [1, 8, 9, 10])
# Remove LoRA 1 and continue adding.
run_check(llm.remove_lora, 1, [8, 9, 10])
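    # With LoRA 1 removed there is a free slot, so the next add should not
    # evict anything; eviction resumes once all four slots are filled again.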
run_check(llm.add_lora, make_lora_request(11), [8, 9, 10, 11])
run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11])
run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11])
# Remove all LoRAs
run_check(llm.remove_lora, 13, [12, 10, 11])
run_check(llm.remove_lora, 12, [10, 11])
run_check(llm.remove_lora, 11, [10])
    run_check(llm.remove_lora, 10, [])


@pytest.mark.asyncio
async def test_lora_functions_async():
if os.getenv("VLLM_USE_V1") == "0":
pytest.skip(
reason=
"V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions")
    # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
    # environment variable. Reload vllm.engine.async_llm_engine, as
    # vllm.engine.async_llm_engine.AsyncLLMEngine changes depending on the
    # env var.
import importlib
import vllm.engine.async_llm_engine
importlib.reload(vllm.engine.async_llm_engine)
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
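    # build_async_engine_client_from_engine_args yields an engine client whose
    # LoRA management methods mirror the sync API above as coroutines.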
max_loras = 4
engine_args = AsyncEngineArgs(model=MODEL_PATH,
enable_lora=True,
max_loras=max_loras,
max_lora_rank=LORA_RANK,
max_model_len=128,
gpu_memory_utilization=0.8,
                                  enforce_eager=True)

    async def run_check(fn, args, expected: list):
await fn(args)
        assert set(await llm.list_loras()) == set(expected)

    async with build_async_engine_client_from_engine_args(engine_args) as llm:
await run_check(llm.add_lora, make_lora_request(1), [1])
await run_check(llm.add_lora, make_lora_request(2), [1, 2])
# Pin LoRA 1 and test that it is never removed on subsequent adds.
await run_check(llm.pin_lora, 1, [1, 2])
await run_check(llm.add_lora, make_lora_request(3), [1, 2, 3])
await run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4])
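        # As in the sync test, each add beyond the fourth is expected to evict
        # the least recently used unpinned adapter; pinned LoRA 1 stays loaded.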
await run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4])
await run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4])
await run_check(llm.add_lora, make_lora_request(7), [1, 5, 6, 7])
await run_check(llm.add_lora, make_lora_request(8), [1, 8, 6, 7])
await run_check(llm.add_lora, make_lora_request(9), [1, 8, 9, 7])
await run_check(llm.add_lora, make_lora_request(10), [1, 8, 9, 10])
# Remove LoRA 1 and continue adding.
await run_check(llm.remove_lora, 1, [8, 9, 10])
await run_check(llm.add_lora, make_lora_request(11), [8, 9, 10, 11])
await run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11])
await run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11])
# Remove all LoRAs
await run_check(llm.remove_lora, 13, [12, 10, 11])
await run_check(llm.remove_lora, 12, [10, 11])
await run_check(llm.remove_lora, 11, [10])
await run_check(llm.remove_lora, 10, [])