# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from typing import Any, Union

import pytest
import torch

from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CompilationLevel
from vllm.platforms import current_platform

from ..utils import create_new_process_for_each_test
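

# Helper that builds the list of (model name, extra LLM kwargs) pairs to test.
# The base models are always returned; with all=True, quantized checkpoints are
# appended for each quantization method supported on the current platform.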
def models_list(all: bool):
    TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
        ("facebook/opt-125m", {}),
        ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
            "dtype": torch.float16,
            "quantization": "compressed-tensors"
        }),
        ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
            "dtype": torch.float16,
            "quantization": "compressed-tensors"
        }),
        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
            "quantization": "compressed-tensors"
        }),
        ("meta-llama/Llama-3.2-1B-Instruct", {}),
    ]

    if not all:
        return TEST_MODELS

    if is_quant_method_supported("aqlm"):
        TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
            "quantization": "aqlm"
        }))

    # TODO: figure out why this fails.
    if False and is_quant_method_supported("gguf"):  # noqa: SIM223
        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
            "quantization": "gguf"
        }))

    if is_quant_method_supported("gptq"):
        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
            "quantization": "gptq"
        }))

    if is_quant_method_supported("gptq_marlin"):
        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
            "quantization": "gptq_marlin"
        }))

    if is_quant_method_supported("gptq_marlin_24"):
        TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
            "quantization": "gptq_marlin_24"
        }))

    if is_quant_method_supported("marlin"):
        TEST_MODELS.append(
            ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
                "quantization": "marlin"
            }))

    if not current_platform.is_rocm() and is_quant_method_supported("awq"):
        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
            "quantization": "AWQ"
        }))

    return TEST_MODELS
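

# Smoke test: run each model (including the quantized variants) at the given
# compilation level with full-graph Dynamo capture forced on via the
# VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE environment variable.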
@pytest.mark.parametrize(
    "optimization_level",
    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
)
@pytest.mark.parametrize("model_info", models_list(all=True))
@create_new_process_for_each_test()
def test_full_graph(
    monkeypatch: pytest.MonkeyPatch,
    model_info: tuple[str, dict[str, Any]],
    optimization_level: int,
):
    model, model_kwargs = model_info

    with monkeypatch.context() as m:
        # make sure these models can be captured in full graph mode
        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
        print(f"MODEL={model}")

        run_model(optimization_level, model, model_kwargs)
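

# Exercise a full CompilationConfig (piecewise compilation with extra compile
# sizes) on the reduced model list.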
# TODO(luka) add other supported compilation config scenarios here
@pytest.mark.parametrize(
    "compilation_config",
    # additional compile sizes
    [
        CompilationConfig(level=CompilationLevel.PIECEWISE,
                          compile_sizes=[1, 2])
    ])
# only test some of the models
@pytest.mark.parametrize("model_info", models_list(all=False))
@create_new_process_for_each_test()
def test_custom_compile_config(
    model_info: tuple[str, dict[str, Any]],
    compilation_config: CompilationConfig,
):
    model, model_kwargs = model_info
    print(f"MODEL={model}")
    run_model(compilation_config, model, model_kwargs)
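

# Shared helper: construct an LLM with the given compilation config (either a
# CompilationLevel value or a full CompilationConfig), run greedy decoding on
# a few prompts, and print the generations.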
def run_model(compile_config: Union[int, CompilationConfig], model: str,
              model_kwargs: dict[str, Any]):
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0)
    llm = LLM(
        model=model,
        enforce_eager=True,
        tensor_parallel_size=1,
        disable_custom_all_reduce=True,
        compilation_config=compile_config,
        **model_kwargs,
    )
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")