import pytest

from vllm import envs
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext

# These tests cover V1 engine defaults only; skip the whole module unless
# the V1 engine is enabled via the VLLM_USE_V1 environment variable.
if not envs.VLLM_USE_V1:
    pytest.skip(
        "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
        allow_module_level=True,
    )


def test_defaults():
    engine_args = EngineArgs(model="facebook/opt-125m")

    # Assert V1 defaults
    assert (engine_args.enable_prefix_caching
            ), "V1 turns on prefix caching by default"


def test_defaults_with_usage_context():
    engine_args = EngineArgs(model="facebook/opt-125m")
    vllm_config: VllmConfig = engine_args.create_engine_config(
        UsageContext.LLM_CLASS)

    # The LLM class defaults to a larger batched-token budget than the
    # OpenAI API server.
    assert vllm_config.scheduler_config.max_num_seqs == 1024
    assert vllm_config.scheduler_config.max_num_batched_tokens == 8192

    engine_args = EngineArgs(model="facebook/opt-125m")
    vllm_config = engine_args.create_engine_config(
        UsageContext.OPENAI_API_SERVER)
    assert vllm_config.scheduler_config.max_num_seqs == 1024
    assert vllm_config.scheduler_config.max_num_batched_tokens == 2048


def test_prefix_cache_disabled_with_multimodel():
    # Prefix caching should be off by default for multimodal models.
    engine_args = EngineArgs(model="llava-hf/llava-1.5-7b-hf")

    vllm_config = engine_args.create_engine_config(UsageContext.LLM_CLASS)
    assert not vllm_config.cache_config.enable_prefix_caching
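

# --- Illustrative sketch, not part of the test suite ---
# A minimal way to inspect the resolved V1 defaults for a model and usage
# context, using only the APIs already exercised above. The chosen model
# and the printed fields are assumptions for illustration.
if __name__ == "__main__":
    args = EngineArgs(model="facebook/opt-125m")
    config = args.create_engine_config(UsageContext.LLM_CLASS)
    print("max_num_seqs:", config.scheduler_config.max_num_seqs)
    print("max_num_batched_tokens:",
          config.scheduler_config.max_num_batched_tokens)
    print("enable_prefix_caching:",
          config.cache_config.enable_prefix_caching)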