vllm/tests/v1/tpu/test_perf.py

# SPDX-License-Identifier: Apache-2.0
"""A basic performance regression test for TPUs

Run `pytest tests/v1/tpu/test_perf.py`.
"""
from __future__ import annotations

import time
from dataclasses import dataclass
from typing import TYPE_CHECKING

import numpy as np
import pytest

from vllm.platforms import current_platform
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer

if TYPE_CHECKING:
    from tests.conftest import VllmRunner


@dataclass
class TestParams:
    model: str
    num_prompts: int
    prefix_len: int
    decode_len: int
    expected_avg_time: float
    err_tol: float


TEST_PARAMS = [
    # TODO: Cannot run a series of tests because:
    #   RuntimeError: Bad StatusOr access: UNKNOWN: TPU initialization failed:
    #   open(/dev/vfio/0): Device or resource busy: Device or resource busy;
    #   Couldn't open iommu group /dev/vfio/0
    # => Investigate

    # TestParams(
    #     model="Qwen/Qwen2.5-1.5B-Instruct",
    #     num_prompts=1,
    #     prefix_len=10,
    #     decode_len=5,
    #     expected_avg_time=0.03,
    #     err_tol=0.01,
    # ),
    # TestParams(
    #     model="Qwen/Qwen2.5-1.5B-Instruct",
    #     num_prompts=10,
    #     prefix_len=100,
    #     decode_len=50,
    #     expected_avg_time=0.234,
    #     err_tol=0.020,
    # ),
    TestParams(
        model="Qwen/Qwen2.5-1.5B-Instruct",
        num_prompts=64,
        prefix_len=500,
        decode_len=50,

        # (This is the active CI/CD instance)
        # commit id: ccb246776d93ef105904a8ec015b3587240a1183
        # tpu: v5lite (vllm CI/CD)
        expected_avg_time=1.4,
        err_tol=0.30,

        # (TODO: There is no v6e in CI/CD currently)
        # commit id: ccb246776d93ef105904a8ec015b3587240a1183
        # tpu: v6e
        # expected_avg_time=1.5,
        # err_tol=0.20,
    ),
]

NUM_WARMUPS = 5
NUM_RUNS = 10

MAX_MODEL_LEN = 1024
MAX_NUM_SEQS = 32
GPU_UTIL = 0.9


@pytest.mark.skipif(not current_platform.is_tpu(),
                    reason="This is a basic performance test for TPU only")
@pytest.mark.parametrize("params", TEST_PARAMS)
def test_perf(
    vllm_runner: type[VllmRunner],
    monkeypatch: pytest.MonkeyPatch,
    params: TestParams,
) -> None:
    tokenizer = get_tokenizer(params.model,
                              tokenizer_mode="auto",
                              trust_remote_code=True)

    prompts = []
    for i in range(params.num_prompts):
        prefix_token_ids = np.random.randint(0,
                                             tokenizer.vocab_size,
                                             size=params.prefix_len).tolist()
        prompt = tokenizer.decode(prefix_token_ids)
        prompts.append(prompt)

    print(
        "-- Running: num_prompts = {} prefix_len = {} decode_len = {}".format(
            len(prompts), params.prefix_len, params.decode_len))

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        sampling_params = SamplingParams(max_tokens=params.decode_len,
                                         temperature=1.0,
                                         min_p=0.0)

        with vllm_runner(params.model,
                         max_num_batched_tokens=MAX_MODEL_LEN,
                         max_model_len=MAX_MODEL_LEN,
                         max_num_seqs=MAX_NUM_SEQS,
                         gpu_memory_utilization=GPU_UTIL,
                         enforce_eager=False,
                         tensor_parallel_size=1) as vllm_model:
            print("  -- Warmup / Compile")
            for i in range(NUM_WARMUPS):
                _ = vllm_model.generate(prompts, sampling_params)

            print("  -- Benchmarking... ")
            times = []
            for i in range(NUM_RUNS):
                start_time = time.time()
                _ = vllm_model.generate(prompts, sampling_params)
                times.append(time.time() - start_time)

            avg_time = sum(times) / len(times)

            print("  -- avg_time = {}".format(avg_time))
            print("  -- expected_avg_time = {} with err_tol = {}".format(
                params.expected_avg_time, params.err_tol))
            diff = avg_time - params.expected_avg_time
            ok = diff < params.err_tol
            if diff < -params.err_tol:
                print("  !! WARNING !! Performance has improved by {}, "
                      "it may be necessary to fine-tune the "
                      "expected_avg_time = {}".format(
                          -diff, params.expected_avg_time))

            assert ok, " !! ERROR !! Regression detected"
[V1] TPU CI - Add basic perf regression test (#15414) Signed-off-by: Alexander Matveev <amatveev@redhat.com> 2025-03-31 13:25:20 -04:00			`# SPDX-License-Identifier: Apache-2.0`
			`"""A basic performance regression test for TPUs`

			Run `pytest tests/v1/tpu/test_perf.py`.
			`"""`
			`from __future__ import annotations`

			`import time`
			`from dataclasses import dataclass`
			`from typing import TYPE_CHECKING`

			`import numpy as np`
			`import pytest`

			`from vllm.platforms import current_platform`
			`from vllm.sampling_params import SamplingParams`
			`from vllm.transformers_utils.tokenizer import get_tokenizer`

			`if TYPE_CHECKING:`
			`from tests.conftest import VllmRunner`


			`@dataclass`
			`class TestParams:`
			`model: str`
			`num_prompts: int`
			`prefix_len: int`
			`decode_len: int`
			`expected_avg_time: float`
			`err_tol: float`


			`TEST_PARAMS = [`
			`# TODO: Cannot run a series of tests because:`
			`# RuntimeError: Bad StatusOr access: UNKNOWN: TPU initialization failed:`
			`# open(/dev/vfio/0): Device or resource busy: Device or resource busy;`
			`# Couldn't open iommu group /dev/vfio/0`
			`# => Investigate`

			`# TestParams(`
			`# model="Qwen/Qwen2.5-1.5B-Instruct",`
			`# num_prompts=1,`
			`# prefix_len=10,`
			`# decode_len=5,`
			`# expected_avg_time=0.03,`
			`# err_tol=0.01,`
			`# ),`
			`# TestParams(`
			`# model="Qwen/Qwen2.5-1.5B-Instruct",`
			`# num_prompts=10,`
			`# prefix_len=100,`
			`# decode_len=50,`
			`# expected_avg_time=0.234,`
			`# err_tol=0.020,`
			`# ),`
			`TestParams(`
			`model="Qwen/Qwen2.5-1.5B-Instruct",`
			`num_prompts=64,`
			`prefix_len=500,`
			`decode_len=50,`

			`# (This is the active CI/CD instance)`
			`# commit id: ccb246776d93ef105904a8ec015b3587240a1183`
			`# tpu: v5lite (vllm CI/CD)`
			`expected_avg_time=1.4,`
			`err_tol=0.30,`

			`# (TODO: There is no v6e in CI/CD currently)`
			`# commit id: ccb246776d93ef105904a8ec015b3587240a1183`
			`# tpu: v6e`
			`# expected_avg_time=1.5,`
			`# err_tol=0.20,`
			`),`
			`]`

			`NUM_WARMUPS = 5`
			`NUM_RUNS = 10`

			`MAX_MODEL_LEN = 1024`
			`MAX_NUM_SEQS = 32`
			`GPU_UTIL = 0.9`


			`@pytest.mark.skipif(not current_platform.is_tpu(),`
			`reason="This is a basic performance test for TPU only")`
			`@pytest.mark.parametrize("params", TEST_PARAMS)`
			`def test_perf(`
			`vllm_runner: type[VllmRunner],`
			`monkeypatch: pytest.MonkeyPatch,`
			`params: TestParams,`
			`) -> None:`
			`tokenizer = get_tokenizer(params.model,`
			`tokenizer_mode="auto",`
			`trust_remote_code=True)`

			`prompts = []`
			`for i in range(params.num_prompts):`
			`prefix_token_ids = np.random.randint(0,`
			`tokenizer.vocab_size,`
			`size=params.prefix_len).tolist()`
			`prompt = tokenizer.decode(prefix_token_ids)`
			`prompts.append(prompt)`

			`print(`
			`"-- Running: num_prompts = {} prefix_len = {} decode_len = {}".format(`
			`len(prompts), params.prefix_len, params.decode_len))`

			`with monkeypatch.context() as m:`
			`m.setenv("VLLM_USE_V1", "1")`

			`sampling_params = SamplingParams(max_tokens=params.decode_len,`
			`temperature=1.0,`
			`min_p=0.0)`

			`with vllm_runner(params.model,`
			`max_num_batched_tokens=MAX_MODEL_LEN,`
			`max_model_len=MAX_MODEL_LEN,`
			`max_num_seqs=MAX_NUM_SEQS,`
			`gpu_memory_utilization=GPU_UTIL,`
			`enforce_eager=False,`
			`tensor_parallel_size=1) as vllm_model:`
			`print(" -- Warmup / Compile")`
			`for i in range(NUM_WARMUPS):`
			`_ = vllm_model.generate(prompts, sampling_params)`

			`print(" -- Benchmarking... ")`
			`times = []`
			`for i in range(NUM_RUNS):`
			`start_time = time.time()`
			`_ = vllm_model.generate(prompts, sampling_params)`
			`times.append(time.time() - start_time)`

			`avg_time = sum(times) / len(times)`

			`print(" -- avg_time = {}".format(avg_time))`
			`print(" -- expected_avg_time = {} with err_tol = {}".format(`
			`params.expected_avg_time, params.err_tol))`
			`diff = avg_time - params.expected_avg_time`
			`ok = diff < params.err_tol`
			`if diff < -params.err_tol:`
			`print(" !! WARNING !! Performance has improved by {}, "`
			`"it may be necessary to fine-tune the "`
			`"expected_avg_time = {}".format(`
			`-diff, params.expected_avg_time))`

			`assert ok, " !! ERROR !! Regression detected"`