# SPDX-License-Identifier: Apache-2.0
"""Test the functionality of the Transformers backend.

Run `pytest tests/models/test_transformers.py`.
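A single test can be selected with pytest's `-k` flag, e.g.
`pytest tests/models/test_transformers.py -k test_quantization`.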
"""
from contextlib import nullcontext
from typing import Type

import pytest

from ..conftest import HfRunner, VllmRunner
from ..utils import multi_gpu_test
from .utils import check_logprobs_close


def check_implementation(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    example_prompts: list[str],
    model: str,
    **kwargs,
):
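    """Compare greedy generations from `vllm_runner` and `hf_runner`.

    Both backends generate `max_tokens` tokens for the same prompts, and the
    top-`num_logprobs` logprobs are asserted to be close with
    `check_logprobs_close`.
    """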
    max_tokens = 32
    num_logprobs = 5

    with vllm_runner(model, **kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize(
    "model,model_impl",
    [
        ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
        ("openai-community/gpt2", "transformers"),
        ("ArthurZ/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
    ])  # trust_remote_code=True by default
def test_models(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    example_prompts: list[str],
    model: str,
    model_impl: str,
) -> None:
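    """Compare each model against HF, expecting an informative error when the
    Transformers backend does not support the model."""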
    maybe_raises = nullcontext()
    if model == "openai-community/gpt2" and model_impl == "transformers":
        # This model is not compatible with the Transformers backend
        maybe_raises = pytest.raises(
            ValueError,
            match="The Transformers implementation.*not compatible with vLLM")

    with maybe_raises:
        check_implementation(hf_runner,
                             vllm_runner,
                             example_prompts,
                             model,
                             model_impl=model_impl)


@multi_gpu_test(num_gpus=2)
def test_distributed(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    example_prompts: list[str],
):
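    """Run the HF-vs-vLLM comparison with the Transformers backend sharded
    across two GPUs (tensor_parallel_size=2)."""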
    kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
    check_implementation(hf_runner, vllm_runner, example_prompts,
                         "meta-llama/Llama-3.2-1B-Instruct", **kwargs)


@pytest.mark.parametrize("model, quantization_kwargs", [
    (
        "meta-llama/Llama-3.2-1B-Instruct",
        {
            "quantization": "bitsandbytes",
            "load_format": "bitsandbytes",
        },
    ),
])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_quantization(
    vllm_runner: Type[VllmRunner],
    example_prompts: list[str],
    model: str,
    quantization_kwargs: dict[str, str],
    max_tokens: int,
    num_logprobs: int,
) -> None:
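    """Check that bitsandbytes-quantized outputs from the Transformers backend
    match those from vLLM's native (auto) implementation."""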
    with vllm_runner(
            model, model_impl="auto", enforce_eager=True,
            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)

    with vllm_runner(
            model,
            model_impl="transformers",
            enforce_eager=True,
            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
        transformers_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)

    check_logprobs_close(
        outputs_0_lst=transformers_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="transformers",
        name_1="vllm",
    )