# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import dataclasses

import pytest

from vllm.config import CompilationLevel
from vllm.utils import cuda_device_count_stateless

from ..utils import compare_all_settings


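# Each TestSetting below describes one end-to-end correctness run: the model
# and extra CLI args to launch it with, the pipeline-/tensor-parallel degrees
# (pp_size / tp_size), the attention backend to force, the inference method to
# exercise, and whether Dynamo is expected to capture the model as one full
# graph.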
@dataclasses.dataclass
class TestSetting:
    model: str
    model_args: list[str]
    pp_size: int
    tp_size: int
    attn_backend: str
    method: str
    fullgraph: bool


# we cannot afford testing the full Cartesian product
# of all models and all levels
@pytest.mark.parametrize(
    "test_setting",
    [
        # basic llama model
        TestSetting(
            model="meta-llama/Llama-3.2-1B-Instruct",
            model_args=[],
            pp_size=2,
            tp_size=2,
            attn_backend="FLASHINFER",
            method="generate",
            fullgraph=True,
        ),
        # llama model with quantization
        TestSetting(
            model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
            model_args=["--quantization", "gptq"],
            pp_size=1,
            tp_size=1,
            attn_backend="FLASH_ATTN",
            method="generate",
            fullgraph=True,
        ),
        # MoE model
        TestSetting(
            model="ibm/PowerMoE-3b",
            model_args=[],
            pp_size=1,
            tp_size=2,
            attn_backend="FLASH_ATTN",
            method="generate",
            fullgraph=True,
        ),
        # embedding model
        TestSetting(
            model="BAAI/bge-multilingual-gemma2",
            model_args=["--task", "embed", "--dtype", "bfloat16"],
            pp_size=1,
            tp_size=1,
            attn_backend="FLASH_ATTN",
            method="encode",
            fullgraph=True,
        ),
        # encoder-based embedding model (BERT)
        TestSetting(
            model="BAAI/bge-base-en-v1.5",
            model_args=["--task", "embed"],
            pp_size=1,
            tp_size=1,
            attn_backend="XFORMERS",
            method="encode",
            fullgraph=True,
        ),
        # vision language model
        TestSetting(
            model="microsoft/Phi-3.5-vision-instruct",
            model_args=["--trust-remote-code", "--max-model-len", "2048"],
            pp_size=2,
            tp_size=1,
            attn_backend="FLASH_ATTN",
            method="generate_with_image",
            fullgraph=False,
        ),
    ])
def test_compile_correctness(
    monkeypatch: pytest.MonkeyPatch,
    test_setting: TestSetting,
):
    # this test is run under multiple suites, with different GPUs.
    # make sure we only run the test with the correct CUDA devices.
    # don't use "<", as it will duplicate the tests.
    model = test_setting.model
    model_args = test_setting.model_args
    pp_size = test_setting.pp_size
    tp_size = test_setting.tp_size
    attn_backend = test_setting.attn_backend
    method = test_setting.method
    fullgraph = test_setting.fullgraph
    if cuda_device_count_stateless() != pp_size * tp_size:
        pytest.skip("Not correct CUDA devices for the test.")
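    # (cuda_device_count_stateless re-reads the visible device count instead
    # of torch.cuda.device_count(), which caches its result; the surrounding
    # suites may change CUDA_VISIBLE_DEVICES between tests.)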

    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
        final_args = [
            "--enforce-eager", *model_args, "-pp",
            str(pp_size), "-tp",
            str(tp_size)
        ]
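        # NOTE: "--enforce-eager" is passed to every run, so (presumably) any
        # output difference between the settings below comes from the
        # torch.compile level alone rather than from CUDA graph capture.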

        all_args: list[list[str]] = []
        all_envs: list[dict[str, str] | None] = []

        for level in [
                CompilationLevel.NO_COMPILATION,
                CompilationLevel.PIECEWISE,
        ]:
            all_args.append(final_args + [f"-O{level}"])
            all_envs.append({})
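
        # note: CompilationLevel is an integer enum, so the f"-O{level}" flag
        # above renders as e.g. "-O0" (NO_COMPILATION) or "-O3" (PIECEWISE);
        # the exact numeric values are assumed from vllm.config.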

        # inductor may change the numerics of the output, so we only check
        # that the outputs are close, not exactly the same.
        compare_all_settings(
            model,
            all_args,
            all_envs,
            method=method if method != "generate" else "generate_close")
        all_envs.clear()
        all_args.clear()

        for level in [
                CompilationLevel.NO_COMPILATION,
                CompilationLevel.DYNAMO_AS_IS,
                CompilationLevel.DYNAMO_ONCE,
        ]:
            all_args.append(final_args + [f"-O{level}"])
            all_envs.append({})
            if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
                # "DYNAMO_ONCE" will always use fullgraph
                all_envs[-1][
                    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore

        # repeat each setting three times; keep all_envs the same length as
        # all_args, since each args list is paired with an env dict.
        compare_all_settings(model, all_args * 3, all_envs * 3, method=method)
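
# Running this file under pytest executes every TestSetting above; cases whose
# required GPU count (pp_size * tp_size) does not match the number of visible
# CUDA devices are skipped rather than failed. (File path assumed:
# tests/compile/test_basic_correctness.py.)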