"""Compares the outputs of gptq vs gptq_marlin

Note: GPTQ and Marlin do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 5 selections of each other.

Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the
test up to 3 times to see if we pass.

Run `pytest tests/models/test_gptq_marlin.py`.
"""
import os

import pytest

from tests.quantization.utils import is_quant_method_supported
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT

from ...utils import check_logprobs_close

os.environ["TOKENIZERS_PARALLELISM"] = "true"

MAX_MODEL_LEN = 1024

MODELS = [
    # act_order==True, group_size=128
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
    # 8-bit, act_order==True, group_size=channelwise
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
    # 4-bit, act_order==True, group_size=128
    ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
]


@pytest.mark.quant_model
@pytest.mark.flaky(reruns=3)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="gptq_marlin is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner,
    example_prompts,
    model,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    model_name, revision = model

    # Run marlin.
    with vllm_runner(model_name=model_name,
                     revision=revision,
                     dtype=dtype,
                     quantization="marlin",
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=1) as gptq_marlin_model:

        gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)
    _ROPE_DICT.clear()  # clear rope cache to avoid rope dtype error

    # Run gptq.
    # The naive gptq kernel doesn't support bf16 yet.
    # Here we always compare the fp16/bf16 gptq_marlin kernel
    # to the fp16 gptq kernel.
    with vllm_runner(model_name=model_name,
                     revision=revision,
                     dtype="half",
                     quantization="gptq",
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=1) as gptq_model:
        gptq_outputs = gptq_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)
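
    # check_logprobs_close asserts that whenever the two runs choose different
    # tokens at a position, each run's chosen token still appears in the other
    # run's top-`num_logprobs` candidates (the tolerance described in the
    # module docstring).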
    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
        outputs_1_lst=gptq_marlin_outputs,
        name_0="gptq",
        name_1="gptq_marlin",
    )