vllm/tests/quantization/test_configs.py

"""Tests whether Marlin models can be loaded from the autogptq config.

Run `pytest tests/quantization/test_configs.py --forked`.
"""

from dataclasses import dataclass
from typing import Tuple

import pytest

from vllm.config import ModelConfig


@dataclass
class ModelPair:
    model_marlin: str
    model_gptq: str


# Model Id // Quantization Arg // Expected Type
MODEL_ARG_EXPTYPES = [
    # AUTOGPTQ
    # compat: autogptq <=0.7.1 is_marlin_format: bool
    # Model Serialized in Marlin Format should always use Marlin kernel.
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"),
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"),
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"),
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"),
    # Model Serialized in Exllama Format.
    ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"),
    # compat: autogptq >=0.8.0 use checkpoint_format: str
    # Model Serialized in Marlin Format should always use Marlin kernel.
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"),
    # Model Serialized in Exllama Format.
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),

    # AUTOAWQ
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq_marlin"),
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "awq_marlin"),
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"),
]


@pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES)
def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None:
    model_path, quantization_arg, expected_type = model_arg_exptype

    try:
        model_config = ModelConfig(model_path,
                                   model_path,
                                   tokenizer_mode="auto",
                                   trust_remote_code=False,
                                   seed=0,
                                   dtype="float16",
                                   revision=None,
                                   quantization=quantization_arg)
        found_quantization_type = model_config.quantization
    except ValueError:
        found_quantization_type = "ERROR"

    assert found_quantization_type == expected_type, (
        f"Expected quant_type == {expected_type} for {model_path}, "
        f"but found {found_quantization_type} "
        f"for no --quantization {quantization_arg} case")
[Kernel] Marlin Expansion: Support AutoGPTQ Models with Marlin (#3922) Co-authored-by: alexm <alexm@neuralmagic.com> Co-authored-by: mgoin <michael@neuralmagic.com> 2024-04-29 12:35:34 -04:00			`"""Tests whether Marlin models can be loaded from the autogptq config.`

			Run `pytest tests/quantization/test_configs.py --forked`.
			`"""`

			`from dataclasses import dataclass`
[mypy] Enable type checking for test directory (#5017) 2024-06-15 12:45:31 +08:00			`from typing import Tuple`
[Kernel] Marlin Expansion: Support AutoGPTQ Models with Marlin (#3922) Co-authored-by: alexm <alexm@neuralmagic.com> Co-authored-by: mgoin <michael@neuralmagic.com> 2024-04-29 12:35:34 -04:00
			`import pytest`

			`from vllm.config import ModelConfig`


			`@dataclass`
			`class ModelPair:`
			`model_marlin: str`
			`model_gptq: str`


			`# Model Id // Quantization Arg // Expected Type`
			`MODEL_ARG_EXPTYPES = [`
			`# AUTOGPTQ`
			`# compat: autogptq <=0.7.1 is_marlin_format: bool`
			`# Model Serialized in Marlin Format should always use Marlin kernel.`
			`("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"),`
			`("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"),`
			`("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"),`
			`("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"),`
			`# Model Serialized in Exllama Format.`
			`("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"),`
			`("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"),`
			`("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"),`
			`("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"),`
			`# compat: autogptq >=0.8.0 use checkpoint_format: str`
			`# Model Serialized in Marlin Format should always use Marlin kernel.`
			`("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"),`
			`("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"),`
			`("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"),`
			`("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"),`
			`# Model Serialized in Exllama Format.`
			`("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"),`
			`("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"),`
			`("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"),`
			`("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),`

			`# AUTOAWQ`
[Kernel][Core] Add AWQ support to the Marlin kernel (#6612) 2024-07-21 19:41:42 -04:00			`("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq_marlin"),`
[Kernel] Marlin Expansion: Support AutoGPTQ Models with Marlin (#3922) Co-authored-by: alexm <alexm@neuralmagic.com> Co-authored-by: mgoin <michael@neuralmagic.com> 2024-04-29 12:35:34 -04:00			`("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),`
[Kernel][Core] Add AWQ support to the Marlin kernel (#6612) 2024-07-21 19:41:42 -04:00			`("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "awq_marlin"),`
[Kernel] Marlin Expansion: Support AutoGPTQ Models with Marlin (#3922) Co-authored-by: alexm <alexm@neuralmagic.com> Co-authored-by: mgoin <michael@neuralmagic.com> 2024-04-29 12:35:34 -04:00			`("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"),`
			`]`


			`@pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES)`
[mypy] Enable type checking for test directory (#5017) 2024-06-15 12:45:31 +08:00			`def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None:`
[Kernel] Marlin Expansion: Support AutoGPTQ Models with Marlin (#3922) Co-authored-by: alexm <alexm@neuralmagic.com> Co-authored-by: mgoin <michael@neuralmagic.com> 2024-04-29 12:35:34 -04:00			`model_path, quantization_arg, expected_type = model_arg_exptype`

			`try:`
			`model_config = ModelConfig(model_path,`
			`model_path,`
			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`seed=0,`
			`dtype="float16",`
			`revision=None,`
			`quantization=quantization_arg)`
			`found_quantization_type = model_config.quantization`
			`except ValueError:`
			`found_quantization_type = "ERROR"`

			`assert found_quantization_type == expected_type, (`
			`f"Expected quant_type == {expected_type} for {model_path}, "`
			`f"but found {found_quantization_type} "`
			`f"for no --quantization {quantization_arg} case")`