vllm/tests/compile/test_basic_correctness.py

# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

import dataclasses

import pytest

from vllm.config import CompilationLevel
from vllm.utils import cuda_device_count_stateless

from ..utils import compare_all_settings


@dataclasses.dataclass
class TestSetting:
    model: str
    model_args: list[str]
    pp_size: int
    tp_size: int
    attn_backend: str
    method: str
    fullgraph: bool


# we cannot afford testing the full Catesian product
# of all models and all levels
@pytest.mark.parametrize(
    "test_setting",
    [
        # basic llama model
        TestSetting(
            model="meta-llama/Llama-3.2-1B-Instruct",
            model_args=[],
            pp_size=2,
            tp_size=2,
            attn_backend="FLASHINFER",
            method="generate",
            fullgraph=True,
        ),
        # llama model with quantization
        TestSetting(
            model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
            model_args=["--quantization", "gptq"],
            pp_size=1,
            tp_size=1,
            attn_backend="FLASH_ATTN",
            method="generate",
            fullgraph=True,
        ),
        # MoE model
        TestSetting(
            model="ibm/PowerMoE-3b",
            model_args=[],
            pp_size=1,
            tp_size=2,
            attn_backend="FLASH_ATTN",
            method="generate",
            fullgraph=True,
        ),
        # embedding model
        TestSetting(
            model="BAAI/bge-multilingual-gemma2",
            model_args=["--task", "embed"],
            pp_size=1,
            tp_size=1,
            attn_backend="FLASH_ATTN",
            method="encode",
            fullgraph=True,
        ),
        # encoder-based embedding model (BERT)
        TestSetting(
            model="BAAI/bge-base-en-v1.5",
            model_args=["--task", "embed"],
            pp_size=1,
            tp_size=1,
            attn_backend="XFORMERS",
            method="encode",
            fullgraph=True,
        ),
        # vision language model
        TestSetting(
            model="microsoft/Phi-3.5-vision-instruct",
            model_args=["--trust-remote-code", "--max-model-len", "2048"],
            pp_size=2,
            tp_size=1,
            attn_backend="FLASH_ATTN",
            method="generate_with_image",
            fullgraph=False,
        ),
    ])
def test_compile_correctness(
    monkeypatch: pytest.MonkeyPatch,
    test_setting: TestSetting,
):
    # this test is run under multiple suits, with different GPUs.
    # make sure we only run the test with correct CUDA devices.
    # don't use "<", as it will duplicate the tests.
    model = test_setting.model
    model_args = test_setting.model_args
    pp_size = test_setting.pp_size
    tp_size = test_setting.tp_size
    attn_backend = test_setting.attn_backend
    method = test_setting.method
    fullgraph = test_setting.fullgraph
    if cuda_device_count_stateless() != pp_size * tp_size:
        pytest.skip("Not correct CUDA devices for the test.")

    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
        final_args = [
            "--enforce-eager", *model_args, "-pp",
            str(pp_size), "-tp",
            str(tp_size)
        ]

        all_args: list[list[str]] = []
        all_envs: list[dict[str, str] | None] = []

        for level in [
                CompilationLevel.NO_COMPILATION,
                CompilationLevel.PIECEWISE,
        ]:
            all_args.append(final_args + [f"-O{level}"])
            all_envs.append({})

        # inductor will change the output, so we only compare if the output
        # is close, not exactly the same.
        compare_all_settings(
            model,
            all_args,
            all_envs,
            method=method if method != "generate" else "generate_close")
        all_envs.clear()
        all_args.clear()

        for level in [
                CompilationLevel.NO_COMPILATION,
                CompilationLevel.DYNAMO_AS_IS,
                CompilationLevel.DYNAMO_ONCE,
        ]:
            all_args.append(final_args + [f"-O{level}"])
            all_envs.append({})
            if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
                # "DYNAMO_ONCE" will always use fullgraph
                all_envs[-1][
                    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore

        compare_all_settings(model, all_args * 3, all_envs, method=method)