vllm/tests/models/test_fp8.py

# flake8: noqa
"""Tests fp8 models against ground truth generation
Note: these tests will only pass on L4 GPU.
"""
import os

import pytest
import torch
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS

# Opt in to tokenizer parallelism via the TOKENIZERS_PARALLELISM env var.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Context length passed to the engine for every model under test.
MAX_MODEL_LEN = 1024

# Checkpoint ids, named once so the model list and the expectation map
# cannot drift apart.
_FP8_CHECKPOINT = "nm-testing/Meta-Llama-3-8B-Instruct-FP8"
_BASE_CHECKPOINT = "meta-llama/Meta-Llama-3-8B-Instruct"

MODELS = [_FP8_CHECKPOINT, _BASE_CHECKPOINT]

# Recorded ground-truth completions for the fp8 checkpoint, one per prompt.
_FP8_EXPECTED_STRS = [
    'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
    'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
    'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
    'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
    'Zeta-5, a highly advanced robot designed for menial labor, whirred to a',
    'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
    'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
    'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o',
]

# Recorded ground-truth completions for the unquantized checkpoint, which the
# test loads with quantization="fp8".
_BASE_EXPECTED_STRS = [
    'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
    'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
    'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
    'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
    'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short',
    'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
    'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
    'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu',
]

# Maps each model name to its expected completions, in prompt order.
EXPECTED_STRS_MAP = {
    _FP8_CHECKPOINT: _FP8_EXPECTED_STRS,
    _BASE_CHECKPOINT: _BASE_EXPECTED_STRS,
}

# Decide once, at import time, whether the fp8 test below can run here.
# torch.cuda.get_device_capability() raises when no CUDA device is usable,
# which would abort test collection instead of skipping, so only query it
# when CUDA is actually available.
if torch.cuda.is_available():
    _major, _minor = torch.cuda.get_device_capability()
    # Fold (major, minor) into one integer, e.g. (8, 9) -> 89, so it can be
    # compared with the value returned by get_min_capability().
    capability = _major * 10 + _minor
    fp8_not_supported = (capability <
                         QUANTIZATION_METHODS["fp8"].get_min_capability())
else:
    # No GPU: report the lowest capability and skip the fp8 test.
    capability = 0
    fp8_not_supported = True


@pytest.mark.skipif(fp8_not_supported,
                    reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(
    example_prompts,
    model_name,
) -> None:
    """Compare fp8 generations with the recorded ground-truth strings.

    Loads ``model_name`` with ``quantization="fp8"``, wraps every prompt in
    the tokenizer's chat template, generates up to 20 tokens per prompt at
    temperature 0, and requires an exact match against
    ``EXPECTED_STRS_MAP[model_name]``.

    Args:
        example_prompts: Prompts to run. Not defined in this file;
            presumably a pytest fixture provided by a conftest.
        model_name: Checkpoint to load; also the key into
            ``EXPECTED_STRS_MAP``.
    """
    model = LLM(model=model_name,
                max_model_len=MAX_MODEL_LEN,
                enforce_eager=True,
                quantization="fp8")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    formatted_prompts = [
        tokenizer.apply_chat_template([{
            "role": "user",
            "content": prompt
        }],
                                      tokenize=False,
                                      add_generation_prompt=True)
        for prompt in example_prompts
    ]

    params = SamplingParams(max_tokens=20, temperature=0)
    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    generations = [
        model.generate(prompt, params)[0].outputs[0].text
        for prompt in formatted_prompts
    ]
    del model

    print(generations)
    expected_strs = EXPECTED_STRS_MAP[model_name]
    # Fail loudly if the prompts and the recorded expectations drift apart,
    # rather than hitting an IndexError or leaving expectations unchecked.
    assert len(expected_strs) == len(generations), (
        f"Expected {len(expected_strs)} generations for {model_name!r}, "
        f"got {len(generations)}")
    for i, (expected_str,
            generated_str) in enumerate(zip(expected_strs, generations)):
        assert expected_str == generated_str, (
            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
[Kernel] Support Fp8 Checkpoints (Dynamic + Static) (#4332) Co-authored-by: Philipp Moritz <pcmoritz@gmail.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: mgoin <michael@neuralmagic.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Cody Yu <hao.yu.cody@gmail.com> 2024-04-30 17:46:12 -04:00			`# flake8: noqa`
			`"""Tests fp8 models against ground truth generation`
			`Note: these tests will only pass on L4 GPU.`
			`"""`
			`import os`

			`import pytest`
			`import torch`
			`from transformers import AutoTokenizer`

			`from vllm import LLM, SamplingParams`
			`from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS`

			`os.environ["TOKENIZERS_PARALLELISM"] = "true"`

			`MAX_MODEL_LEN = 1024`

			`MODELS = [`
			`"nm-testing/Meta-Llama-3-8B-Instruct-FP8",`
			`"meta-llama/Meta-Llama-3-8B-Instruct",`
			`]`

			`EXPECTED_STRS_MAP = {`
			`"nm-testing/Meta-Llama-3-8B-Instruct-FP8": [`
			`'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',`
			`'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',`
			`'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',`
[Kernel] Add flash-attn back (#4907) 2024-05-19 18:11:30 -07:00			`'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',`
			`'Zeta-5, a highly advanced robot designed for menial labor, whirred to a',`
			`'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',`
[Kernel] Support Fp8 Checkpoints (Dynamic + Static) (#4332) Co-authored-by: Philipp Moritz <pcmoritz@gmail.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: mgoin <michael@neuralmagic.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Cody Yu <hao.yu.cody@gmail.com> 2024-04-30 17:46:12 -04:00			`'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',`
[Kernel] Add flash-attn back (#4907) 2024-05-19 18:11:30 -07:00			`'Here are the translations:\n\nJapanese: (Haya aki no tori, guri o',`
[Kernel] Support Fp8 Checkpoints (Dynamic + Static) (#4332) Co-authored-by: Philipp Moritz <pcmoritz@gmail.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: mgoin <michael@neuralmagic.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Cody Yu <hao.yu.cody@gmail.com> 2024-04-30 17:46:12 -04:00			`],`
			`"meta-llama/Meta-Llama-3-8B-Instruct": [`
			`'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',`
			`'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',`
			`'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',`
			`'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',`
[Kernel] Add flash-attn back (#4907) 2024-05-19 18:11:30 -07:00			`'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short',`
[Kernel] Support Fp8 Checkpoints (Dynamic + Static) (#4332) Co-authored-by: Philipp Moritz <pcmoritz@gmail.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: mgoin <michael@neuralmagic.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Cody Yu <hao.yu.cody@gmail.com> 2024-04-30 17:46:12 -04:00			`'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',`
			`'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',`
			`'Here are the translations:\n\nJapanese: (Haya aki wa mushi o tsukamu'`
			`],`
			`}`

			`capability = torch.cuda.get_device_capability()`
			`capability = capability[0] * 10 + capability[1]`
			`fp8_not_supported = (capability <`
			`QUANTIZATION_METHODS["fp8"].get_min_capability())`


			`@pytest.mark.skipif(fp8_not_supported,`
			`reason="fp8 is not supported on this GPU type.")`
			`@pytest.mark.parametrize("model_name", MODELS)`
			`def test_models(`
			`example_prompts,`
			`model_name,`
			`) -> None:`
			`model = LLM(model=model_name,`
			`max_model_len=MAX_MODEL_LEN,`
			`enforce_eager=True,`
			`quantization="fp8")`

			`tokenizer = AutoTokenizer.from_pretrained(model_name)`
			`formatted_prompts = [`
			`tokenizer.apply_chat_template([{`
			`"role": "user",`
			`"content": prompt`
			`}],`
			`tokenize=False,`
			`add_generation_prompt=True)`
			`for prompt in example_prompts`
			`]`

			`params = SamplingParams(max_tokens=20, temperature=0)`
			`generations = []`
			`# Note: these need to be run 1 at a time due to numerical precision,`
			`# since the expected strs were generated this way.`
			`for prompt in formatted_prompts:`
			`outputs = model.generate(prompt, params)`
			`generations.append(outputs[0].outputs[0].text)`
			`del model`

			`print(generations)`
			`expected_strs = EXPECTED_STRS_MAP[model_name]`
			`for i in range(len(example_prompts)):`
			`generated_str = generations[i]`
			`expected_str = expected_strs[i]`
			`assert expected_str == generated_str, (`
			`f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")`