[Misc] Replace os.environ with monkeypatch in test suite (#14516)

Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
Authored by Sibi on 2025-03-17 11:35:57 +08:00, committed by GitHub
parent 1e799b7ec1
commit a73e183e36
43 changed files with 1900 additions and 1658 deletions
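
All of the file diffs below apply one pattern: instead of writing to os.environ directly (which leaks state into later tests), each test accepts pytest's monkeypatch fixture and sets or deletes environment variables inside a monkeypatch.context() block, so every change is rolled back automatically when the block exits. A minimal sketch of that pattern, with an illustrative test name and environment variable rather than code taken from this commit:

import os

import pytest


def test_uses_scoped_env(monkeypatch: pytest.MonkeyPatch):
    # Previously a test might do: os.environ["MY_BACKEND"] = "FLASH_ATTN",
    # which stays set for every test that runs afterwards.
    with monkeypatch.context() as m:
        m.setenv("MY_BACKEND", "FLASH_ATTN")
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        assert os.environ["MY_BACKEND"] == "FLASH_ATTN"
    # On exit the previous environment is restored, so nothing leaks
    # into later tests, even if the test body raises.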

View File

@@ -522,7 +522,7 @@ steps:
     # TODO: investigate and fix
     # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
     - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-    - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+    - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
 - label: Plugin Tests (2 GPUs) # 40min
   working_dir: "/vllm-workspace/tests"

View File

@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     model: str,
     backend: str,
@@ -63,31 +64,33 @@ def test_models(
         pytest.skip(
             f"{backend} does not support gemma2 with full context length.")
-    os.environ["VLLM_ATTENTION_BACKEND"] = backend
-    # 5042 tokens for gemma2
-    # gemma2 has alternating sliding window size of 4096
-    # we need a prompt with more than 4096 tokens to test the sliding window
-    prompt = "The following numbers of the sequence " + ", ".join(
-        str(i) for i in range(1024)) + " are:"
-    example_prompts = [prompt]
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    with VllmRunner(model,
-                    max_model_len=8192,
-                    dtype=dtype,
-                    enforce_eager=enforce_eager,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", backend)
+        # 5042 tokens for gemma2
+        # gemma2 has alternating sliding window size of 4096
+        # we need a prompt with more than 4096 tokens to test the sliding window
+        prompt = "The following numbers of the sequence " + ", ".join(
+            str(i) for i in range(1024)) + " are:"
+        example_prompts = [prompt]
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        with VllmRunner(model,
+                        max_model_len=8192,
+                        dtype=dtype,
+                        enforce_eager=enforce_eager,
+                        gpu_memory_utilization=0.7) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
 @multi_gpu_test(num_gpus=2)
@@ -104,6 +107,7 @@ def test_models(
     ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
 ])
 def test_models_distributed(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     vllm_runner,
     example_prompts,
@@ -116,34 +120,41 @@ def test_models_distributed(
     if test_suite != TARGET_TEST_SUITE:
         pytest.skip(f"Skip test for {test_suite}")
-    if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-        # test Ray Compiled Graph
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-    if attention_backend:
-        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
-    dtype = "half"
-    max_tokens = 5
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=2,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as monkeypatch_context:
+        if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+            # test Ray Compiled Graph
+            monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
+        if attention_backend:
+            monkeypatch_context.setenv(
+                "VLLM_ATTENTION_BACKEND",
+                attention_backend,
+            )
+        dtype = "half"
+        max_tokens = 5
+        # NOTE: take care of the order. run vLLM first, and then run HF.
+        # vLLM needs a fresh new process without cuda initialization.
+        # if we run HF first, the cuda initialization will be done and it
+        # will hurt multiprocessing backend with fork method
+        # (the default method).
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=2,
+                distributed_executor_backend=distributed_executor_backend,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )

View File

@@ -7,16 +7,22 @@ prefill requests are chunked.
 Run `pytest tests/models/test_chunked_prefill.py`.
 """
-import os
+from __future__ import annotations
+from typing import TYPE_CHECKING
 import pytest
-from tests.kernels.utils import override_backend_env_variable
 from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR
 from ..models.utils import check_logprobs_close, check_outputs_equal
 from ..utils import multi_gpu_test
+if TYPE_CHECKING:
+    from .conftest import HfRunner, VllmRunner
 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-3.2-1B-Instruct",
@@ -24,12 +30,14 @@ MODELS = [
 @pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
     """
     Since this module is V0 only, set VLLM_USE_V1=0 for
     all tests in the file.
     """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield
 @pytest.mark.parametrize("model", MODELS)
@@ -42,8 +50,8 @@ def use_v0_only(monkeypatch):
 @pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
     example_prompts,
     model: str,
     dtype: str,
@@ -52,37 +60,39 @@ def test_models(
     enforce_eager: bool,
     tensor_parallel_size: int,
     attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """
     Checks exact match decode between huggingface model and vllm runner with
     chunked prefill.
     """
-    override_backend_env_variable(monkeypatch, attention_backend)
-    max_num_seqs = chunked_prefill_token_size
-    max_num_batched_tokens = chunked_prefill_token_size
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            max_num_batched_tokens=max_num_batched_tokens,
-            enable_chunked_prefill=True,
-            tensor_parallel_size=tensor_parallel_size,
-            enforce_eager=enforce_eager,
-            max_num_seqs=max_num_seqs,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+        max_num_seqs = chunked_prefill_token_size
+        max_num_batched_tokens = chunked_prefill_token_size
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                max_num_batched_tokens=max_num_batched_tokens,
+                enable_chunked_prefill=True,
+                tensor_parallel_size=tensor_parallel_size,
+                enforce_eager=enforce_eager,
+                max_num_seqs=max_num_seqs,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
 @multi_gpu_test(num_gpus=2)
@@ -90,57 +100,61 @@ def test_models(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models_distributed(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
     example_prompts,
     model: str,
     distributed_executor_backend: str,
     attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    override_backend_env_variable(monkeypatch, attention_backend)
-    if (model == "meta-llama/Llama-3.2-1B-Instruct"
-            and distributed_executor_backend == "ray"):
-        # test Ray Compiled Graph
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-    dtype = "half"
-    max_tokens = 5
-    chunked_prefill_token_size = 16
-    # Add a chunked prefill config.
-    max_num_seqs = min(chunked_prefill_token_size, 256)
-    assert chunked_prefill_token_size != -1
-    enable_chunked_prefill = True
-    max_num_batched_tokens = chunked_prefill_token_size
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=2,
-            max_num_seqs=max_num_seqs,
-            enable_chunked_prefill=enable_chunked_prefill,
-            max_num_batched_tokens=max_num_batched_tokens,
-            distributed_executor_backend=distributed_executor_backend,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+        if (model == "meta-llama/Llama-3.2-1B-Instruct"
+                and distributed_executor_backend == "ray"):
+            # test Ray Compiled Graph
+            m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
+        dtype = "half"
+        max_tokens = 5
+        chunked_prefill_token_size = 16
+        # Add a chunked prefill config.
+        max_num_seqs = min(chunked_prefill_token_size, 256)
+        assert chunked_prefill_token_size != -1
+        enable_chunked_prefill = True
+        max_num_batched_tokens = chunked_prefill_token_size
+        # NOTE: take care of the order. run vLLM first, and then run HF.
+        # vLLM needs a fresh new process without cuda initialization.
+        # if we run HF first, the cuda initialization will be done and it
+        # will hurt multiprocessing backend with
+        # fork method (the default method).
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=2,
+                max_num_seqs=max_num_seqs,
+                enable_chunked_prefill=enable_chunked_prefill,
+                max_num_batched_tokens=max_num_batched_tokens,
+                distributed_executor_backend=distributed_executor_backend,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(
+                example_prompts,
+                max_tokens,
+            )
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
 @pytest.mark.parametrize(
@@ -158,7 +172,7 @@ def test_models_distributed(
     # the async postprocessor
 @pytest.mark.parametrize("disable_async_output_proc", [True])
 def test_models_with_fp8_kv_cache(
-    vllm_runner,
+    vllm_runner: VllmRunner,
     example_prompts,
     kv_cache_dtype: str,
     model: str,
@@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache(
 @pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("dtype", ["half"])
 def test_with_prefix_caching(
-    vllm_runner,
+    vllm_runner: VllmRunner,
     max_tokens: int,
     enforce_eager: bool,
     chunk_size: int,
@@ -254,8 +268,10 @@ def test_with_prefix_caching(
     ) as vllm_model:
         outputs[enable] = []
         for prompt in full_prompts:
-            outputs[enable] += vllm_model.generate_greedy([prompt],
-                                                          max_tokens)
+            outputs[enable] += vllm_model.generate_greedy(
+                [prompt],
+                max_tokens,
+            )
     check_outputs_equal(
         outputs_0_lst=outputs[False],
@@ -274,8 +290,8 @@ def test_with_prefix_caching(
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_models_cpu(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
     example_prompts,
     model: str,
     dtype: str,
@@ -283,7 +299,7 @@ def test_models_cpu(
     chunked_prefill_token_size: int,
     enforce_eager: bool,
     attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     test_models(
         hf_runner,
@@ -307,7 +323,7 @@ def test_models_cpu(
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_with_prefix_caching_cpu(
-    vllm_runner,
+    vllm_runner: VllmRunner,
     max_tokens: int,
     enforce_eager: bool,
     chunk_size: int,
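
The use_v0_only fixture above is the fixture form of the same pattern: because the variable must stay set for the whole test, the fixture yields from inside the monkeypatch.context() block and the change is only undone at teardown. A generic sketch with an illustrative fixture and variable name:

import pytest


@pytest.fixture(autouse=True)
def force_legacy_engine(monkeypatch: pytest.MonkeyPatch):
    # The context stays open across the yield, so the variable is set
    # before the test body runs and restored after it finishes.
    with monkeypatch.context() as m:
        m.setenv("MY_USE_V1", "0")
        yield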

View File

@@ -123,40 +123,38 @@ def test_cumem_with_cudagraph():
     # sleep mode with pytorch checkpoint
     ("facebook/opt-125m", False),
 ])
-def test_end_to_end(model: str, use_v1: bool):
-    import os
-    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
-    free, total = torch.cuda.mem_get_info()
-    used_bytes_baseline = total - free  # in case other process is running
-    llm = LLM(model, enable_sleep_mode=True)
-    prompt = "How are you?"
-    sampling_params = SamplingParams(temperature=0, max_tokens=10)
-    output = llm.generate(prompt, sampling_params)
-    # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
-    # which is difficult to measure in the test. therefore, we only
-    # test sleep level 1 here.
-    llm.sleep(level=1)
-    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
-    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
-    # now the memory usage is mostly cudagraph memory pool,
-    # and it should be less than the model weights (1B model, 2GiB weights)
-    # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
-    # is captured but cannot be releasesd from PyTorch due to a known bug,
-    # therefore high memory usage after `llm.sleep` is called is expected.
-    # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
-    # in V1.
-    if use_v1:
-        assert used_bytes < 7 * GiB_bytes
-    else:
-        assert used_bytes < 2 * GiB_bytes
-    llm.wake_up()
-    output2 = llm.generate(prompt, sampling_params)
-    # cmp output
-    assert output[0].outputs[0].text == output2[0].outputs[0].text
-    del os.environ["VLLM_USE_V1"]
+def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        free, total = torch.cuda.mem_get_info()
+        used_bytes_baseline = total - free  # in case other process is running
+        llm = LLM(model, enable_sleep_mode=True)
+        prompt = "How are you?"
+        sampling_params = SamplingParams(temperature=0, max_tokens=10)
+        output = llm.generate(prompt, sampling_params)
+        # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
+        # which is difficult to measure in the test. therefore, we only
+        # test sleep level 1 here.
+        llm.sleep(level=1)
+        free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+        used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+        # now the memory usage is mostly cudagraph memory pool,
+        # and it should be less than the model weights (1B model, 2GiB weights)
+        # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
+        # is captured but cannot be releasesd from PyTorch due to a known bug,
+        # therefore high memory usage after `llm.sleep` is called is expected.
+        # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
+        # in V1.
+        if use_v1:
+            assert used_bytes < 7 * GiB_bytes
+        else:
+            assert used_bytes < 2 * GiB_bytes
+        llm.wake_up()
+        output2 = llm.generate(prompt, sampling_params)
+        # cmp output
+        assert output[0].outputs[0].text == output2[0].outputs[0].text

View File

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
 import dataclasses
-from typing import Optional
 import pytest
@@ -22,75 +22,76 @@ class TestSetting:
     fullgraph: bool
-# representative settings for testing
-test_settings = [
-    # basic llama model
-    TestSetting(
-        model="meta-llama/Llama-3.2-1B-Instruct",
-        model_args=[],
-        pp_size=2,
-        tp_size=2,
-        attn_backend="FLASHINFER",
-        method="generate",
-        fullgraph=True,
-    ),
-    # llama model with quantization
-    TestSetting(
-        model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-        model_args=["--quantization", "gptq"],
-        pp_size=1,
-        tp_size=1,
-        attn_backend="FLASH_ATTN",
-        method="generate",
-        fullgraph=True,
-    ),
-    # MoE model
-    TestSetting(
-        model="ibm/PowerMoE-3b",
-        model_args=[],
-        pp_size=1,
-        tp_size=2,
-        attn_backend="FLASH_ATTN",
-        method="generate",
-        fullgraph=True,
-    ),
-    # embedding model
-    TestSetting(
-        model="BAAI/bge-multilingual-gemma2",
-        model_args=["--task", "embed"],
-        pp_size=1,
-        tp_size=1,
-        attn_backend="FLASH_ATTN",
-        method="encode",
-        fullgraph=True,
-    ),
-    # encoder-based embedding model (BERT)
-    TestSetting(
-        model="BAAI/bge-base-en-v1.5",
-        model_args=["--task", "embed"],
-        pp_size=1,
-        tp_size=1,
-        attn_backend="XFORMERS",
-        method="encode",
-        fullgraph=True,
-    ),
-    # vision language model
-    TestSetting(
-        model="microsoft/Phi-3.5-vision-instruct",
-        model_args=["--trust-remote-code", "--max-model-len", "2048"],
-        pp_size=2,
-        tp_size=1,
-        attn_backend="FLASH_ATTN",
-        method="generate_with_image",
-        fullgraph=False,
-    ),
-]
 # we cannot afford testing the full Catesian product
 # of all models and all levels
-@pytest.mark.parametrize("test_setting", test_settings)
-def test_compile_correctness(test_setting: TestSetting):
+@pytest.mark.parametrize(
+    "test_setting",
+    [
+        # basic llama model
+        TestSetting(
+            model="meta-llama/Llama-3.2-1B-Instruct",
+            model_args=[],
+            pp_size=2,
+            tp_size=2,
+            attn_backend="FLASHINFER",
+            method="generate",
+            fullgraph=True,
+        ),
+        # llama model with quantization
+        TestSetting(
+            model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+            model_args=["--quantization", "gptq"],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="generate",
+            fullgraph=True,
+        ),
+        # MoE model
+        TestSetting(
+            model="ibm/PowerMoE-3b",
+            model_args=[],
+            pp_size=1,
+            tp_size=2,
+            attn_backend="FLASH_ATTN",
+            method="generate",
+            fullgraph=True,
+        ),
+        # embedding model
+        TestSetting(
+            model="BAAI/bge-multilingual-gemma2",
+            model_args=["--task", "embed"],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="encode",
+            fullgraph=True,
+        ),
+        # encoder-based embedding model (BERT)
+        TestSetting(
+            model="BAAI/bge-base-en-v1.5",
+            model_args=["--task", "embed"],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="XFORMERS",
+            method="encode",
+            fullgraph=True,
+        ),
+        # vision language model
+        TestSetting(
+            model="microsoft/Phi-3.5-vision-instruct",
+            model_args=["--trust-remote-code", "--max-model-len", "2048"],
+            pp_size=2,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="generate_with_image",
+            fullgraph=False,
+        ),
+    ])
+def test_compile_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    test_setting: TestSetting,
+):
     # this test is run under multiple suits, with different GPUs.
     # make sure we only run the test with correct CUDA devices.
     # don't use "<", as it will duplicate the tests.
@@ -103,41 +104,45 @@ def test_compile_correctness(test_setting: TestSetting):
     fullgraph = test_setting.fullgraph
     if cuda_device_count_stateless() != pp_size * tp_size:
         pytest.skip("Not correct CUDA devices for the test.")
-    import os
-    os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
-    final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
-        ["-tp", str(tp_size)]
-    all_args: list[list[str]] = []
-    all_envs: list[Optional[dict[str, str]]] = []
-    for level in [
-            CompilationLevel.NO_COMPILATION,
-            CompilationLevel.PIECEWISE,
-    ]:
-        all_args.append(final_args + [f"-O{level}"])
-        all_envs.append({})
-    # inductor will change the output, so we only compare if the output
-    # is close, not exactly the same.
-    compare_all_settings(
-        model,
-        all_args,
-        all_envs,
-        method=method if method != "generate" else "generate_close")
-    all_envs.clear()
-    all_args.clear()
-    for level in [
-            CompilationLevel.NO_COMPILATION,
-            CompilationLevel.DYNAMO_AS_IS,
-            CompilationLevel.DYNAMO_ONCE,
-    ]:
-        all_args.append(final_args + [f"-O{level}"])
-        all_envs.append({})
-        if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
-            # "DYNAMO_ONCE" will always use fullgraph
-            all_envs[-1][
-                "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore
-    compare_all_settings(model, all_args * 3, all_envs, method=method)
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+        final_args = [
+            "--enforce-eager", *model_args, "-pp",
+            str(pp_size), "-tp",
+            str(tp_size)
+        ]
+        all_args: list[list[str]] = []
+        all_envs: list[dict[str, str] | None] = []
+        for level in [
+                CompilationLevel.NO_COMPILATION,
+                CompilationLevel.PIECEWISE,
+        ]:
+            all_args.append(final_args + [f"-O{level}"])
+            all_envs.append({})
+        # inductor will change the output, so we only compare if the output
+        # is close, not exactly the same.
+        compare_all_settings(
+            model,
+            all_args,
+            all_envs,
+            method=method if method != "generate" else "generate_close")
+        all_envs.clear()
+        all_args.clear()
+        for level in [
+                CompilationLevel.NO_COMPILATION,
+                CompilationLevel.DYNAMO_AS_IS,
+                CompilationLevel.DYNAMO_ONCE,
+        ]:
+            all_args.append(final_args + [f"-O{level}"])
+            all_envs.append({})
+            if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
+                # "DYNAMO_ONCE" will always use fullgraph
+                all_envs[-1][
+                    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore
+        compare_all_settings(model, all_args * 3, all_envs, method=method)

View File

@@ -1,22 +1,115 @@
 # SPDX-License-Identifier: Apache-2.0
-import pytest
+from __future__ import annotations
+from typing import Any
+import pytest
+import torch
+from tests.quantization.utils import is_quant_method_supported
+from vllm import LLM, SamplingParams
 from vllm.config import CompilationLevel
+from vllm.platforms import current_platform
 from ..utils import fork_new_process_for_each_test
-from .utils import TEST_MODELS, check_full_graph_support
-@pytest.mark.parametrize("model_info", TEST_MODELS)
+@pytest.fixture(params=None, name="model_info")
+def models_list_fixture(request):
+    TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
+        ("facebook/opt-125m", {}),
+        ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
+            "dtype": torch.float16,
+            "quantization": "compressed-tensors"
+        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
+            "dtype": torch.float16,
+            "quantization": "compressed-tensors"
+        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
+            "quantization": "compressed-tensors"
+        }),
+        ("meta-llama/Llama-3.2-1B-Instruct", {}),
+    ]
+    if is_quant_method_supported("aqlm"):
+        TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
+            "quantization": "aqlm"
+        }))
+    # TODO: figure out why this fails.
+    if False and is_quant_method_supported("gguf"):  # noqa: SIM223
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
+            "quantization": "gguf"
+        }))
+    if is_quant_method_supported("gptq"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
+            "quantization": "gptq"
+        }))
+    if is_quant_method_supported("gptq_marlin"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
+            "quantization": "gptq_marlin"
+        }))
+    if is_quant_method_supported("gptq_marlin_24"):
+        TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
+            "quantization": "gptq_marlin_24"
+        }))
+    if is_quant_method_supported("marlin"):
+        TEST_MODELS.append(
+            ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
+                "quantization": "marlin"
+            }))
+    if not current_platform.is_rocm() and is_quant_method_supported("awq"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
+            "quantization": "AWQ"
+        }))
+    return TEST_MODELS
 @pytest.mark.parametrize(
     "optimization_level",
-    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
+    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
+)
+@pytest.mark.parametrize("model_info", "", indirect=True)
 @fork_new_process_for_each_test
-def test_full_graph(model_info, optimization_level):
-    model = model_info[0]
-    model_kwargs = model_info[1]
-    check_full_graph_support(model,
-                             model_kwargs,
-                             optimization_level,
-                             tp_size=1)
+def test_full_graph(
+    monkeypatch: pytest.MonkeyPatch,
+    model_info: tuple[str, dict[str, Any]],
+    optimization_level: int,
+):
+    model, model_kwargs = model_info
+    with monkeypatch.context() as m:
+        # make sure these models can be captured in full graph mode
+        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
+        print(f"MODEL={model}")
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        sampling_params = SamplingParams(temperature=0)
+        llm = LLM(
+            model=model,
+            enforce_eager=True,
+            tensor_parallel_size=1,
+            disable_custom_all_reduce=True,
+            compilation_config=optimization_level,
+            **model_kwargs,
+        )
+        outputs = llm.generate(prompts, sampling_params)
+        # Print the outputs.
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
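
The rewrite above folds the old module-level TEST_MODELS list into a model_info fixture wired up through indirect parametrization. As a generic illustration of how indirect=True routes a parametrize value through a fixture (placeholder names and values, not the exact setup used above):

import pytest


@pytest.fixture(name="model_info")
def model_info_fixture(request):
    # With indirect=True, the value from the parametrize marker arrives as
    # request.param, so the fixture can post-process it before the test.
    model, model_kwargs = request.param
    return model, dict(model_kwargs)


@pytest.mark.parametrize("model_info", [("facebook/opt-125m", {})],
                         indirect=True)
def test_receives_fixture_value(model_info):
    model, model_kwargs = model_info
    assert model == "facebook/opt-125m"
    assert model_kwargs == {}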

View File

@@ -1,93 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import os
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
TEST_MODELS = [
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]
if is_quant_method_supported("aqlm"):
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
"quantization": "gptq"
}))
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
"quantization": "gptq_marlin_24"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
}))
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
def check_full_graph_support(model,
model_kwargs,
optimization_level,
tp_size=1):
# make sure these models can be captured in full graph mode
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model=model,
enforce_eager=True,
tensor_parallel_size=tp_size,
disable_custom_all_reduce=True,
compilation_config=optimization_level,
**model_kwargs)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@@ -3,7 +3,10 @@
 Run `pytest tests/distributed/test_comm_ops.py`.
 """
-import os
+from __future__ import annotations
+from typing import Any, Callable
 import pytest
 import ray
@@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel
 @ray.remote(num_gpus=1, max_calls=1)
-def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
-                           distributed_init_port: str):
+def all_reduce_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
-def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
-                           distributed_init_port: str):
+def all_gather_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
-def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
-                                      distributed_init_port: str):
+def broadcast_tensor_dict_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
-def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
-                                      distributed_init_port: str):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_tensor_dict_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
-def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
-                          distributed_init_port: str):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
     all_reduce_test_worker, all_gather_test_worker,
     broadcast_tensor_dict_test_worker
 ])
-def test_multi_process_tensor_parallel(tp_size, test_target):
-    multi_process_parallel(tp_size, 1, test_target)
+def test_multi_process_tensor_parallel(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    test_target: Callable[..., Any],
+):
+    multi_process_parallel(monkeypatch, tp_size, 1, test_target)
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
@@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
 @pytest.mark.parametrize("pp_size", [2])
 @pytest.mark.parametrize(
     "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
-def test_multi_process_pipeline_parallel(pp_size, test_target):
-    multi_process_parallel(1, pp_size, test_target)
+def test_multi_process_pipeline_parallel(
+    monkeypatch: pytest.MonkeyPatch,
+    pp_size: int,
+    test_target: Callable[..., Any],
+):
+    multi_process_parallel(monkeypatch, 1, pp_size, test_target)
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
@@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target):
     broadcast_tensor_dict_test_worker
 ])
 def test_multi_process_tensor_parallel_pipeline_parallel(
-        tp_size, pp_size, test_target):
-    multi_process_parallel(tp_size, pp_size, test_target)
+    tp_size: int,
+    pp_size: int,
+    test_target: Callable[..., Any],
+    monkeypatch: pytest.MonkeyPatch,
+):
+    multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
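
Because the ray workers above cannot receive the fixture by injection, the MonkeyPatch object is passed through multi_process_parallel into each worker, which then calls delenv on it. Outside a fixture, the same scoped behaviour is available from pytest.MonkeyPatch.context(). A hedged sketch of both ideas with placeholder names:

import os

import pytest


def worker(monkeypatch: pytest.MonkeyPatch, rank: int) -> None:
    # The worker mutates the environment only through the object it was
    # handed, never through os.environ directly.
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    assert "CUDA_VISIBLE_DEVICES" not in os.environ


def run_workers_locally() -> None:
    # When no fixture is available, MonkeyPatch.context() gives the same
    # automatic undo on exit.
    with pytest.MonkeyPatch.context() as mp:
        for rank in range(2):
            worker(mp, rank)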

View File

@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-import os
 import random
 import pytest
@@ -23,95 +22,115 @@ for i, v in enumerate(test_sizes):
 @ray.remote(num_gpus=1, max_calls=1)
-def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
-    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
-    ensure_model_parallel_initialized(tp_size, pp_size)
-    group = get_tensor_model_parallel_group().device_group
-    # A small all_reduce for warmup.
-    # this is needed because device communicators might be created lazily
-    # (e.g. NCCL). This will ensure that the communicator is initialized
-    # before any communication happens, so that this group can be used for
-    # graph capture immediately.
-    data = torch.zeros(1)
-    data = data.to(device=device)
-    torch.distributed.all_reduce(data, group=group)
-    torch.cuda.synchronize()
-    del data
-    # we use the first group to communicate once
-    # and the second group to communicate twice
-    # and so on
-    # this is used to demonstrate that each group can
-    # communicate independently
-    num_communication = rank // tp_size + 1
-    for sz in test_sizes:
-        for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-            with graph_capture(device=device) as graph_capture_context:
-                # use integers so result matches NCCL exactly
-                inp1 = torch.randint(1,
-                                     16, (sz, ),
-                                     dtype=dtype,
-                                     device=torch.cuda.current_device())
-                inp2 = torch.randint(1,
-                                     16, (sz, ),
-                                     dtype=dtype,
-                                     device=torch.cuda.current_device())
-                torch.cuda.synchronize()
-                graph = torch.cuda.CUDAGraph()
-                with torch.cuda.graph(graph,
-                                      stream=graph_capture_context.stream):
-                    for i in range(num_communication):
-                        out1 = tensor_model_parallel_all_reduce(inp1)
-                        # the input buffer is immediately modified to test
-                        # synchronization
-                        dist.all_reduce(inp1, group=group)
-                        out2 = tensor_model_parallel_all_reduce(inp2)
-                        dist.all_reduce(inp2, group=group)
-            graph.replay()
-            torch.testing.assert_close(out1, inp1)
-            torch.testing.assert_close(out2, inp2)
+def graph_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pp_size,
+    rank,
+    distributed_init_port,
+):
+    with monkeypatch.context() as m:
+        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+        device = torch.device(f"cuda:{rank}")
+        torch.cuda.set_device(device)
+        init_test_distributed_environment(tp_size, pp_size, rank,
+                                          distributed_init_port)
+        ensure_model_parallel_initialized(tp_size, pp_size)
+        group = get_tensor_model_parallel_group().device_group
+        # A small all_reduce for warmup.
+        # this is needed because device communicators might be created lazily
+        # (e.g. NCCL). This will ensure that the communicator is initialized
+        # before any communication happens, so that this group can be used for
+        # graph capture immediately.
+        data = torch.zeros(1)
+        data = data.to(device=device)
+        torch.distributed.all_reduce(data, group=group)
+        torch.cuda.synchronize()
+        del data
+        # we use the first group to communicate once
+        # and the second group to communicate twice
+        # and so on
+        # this is used to demonstrate that each group can
+        # communicate independently
+        num_communication = rank // tp_size + 1
+        for sz in test_sizes:
+            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
+                with graph_capture(device=device) as graph_capture_context:
+                    # use integers so result matches NCCL exactly
+                    inp1 = torch.randint(1,
+                                         16, (sz, ),
+                                         dtype=dtype,
+                                         device=torch.cuda.current_device())
+                    inp2 = torch.randint(1,
+                                         16, (sz, ),
+                                         dtype=dtype,
+                                         device=torch.cuda.current_device())
+                    torch.cuda.synchronize()
+                    graph = torch.cuda.CUDAGraph()
+                    with torch.cuda.graph(graph,
+                                          stream=graph_capture_context.stream):
+                        for i in range(num_communication):
+                            out1 = tensor_model_parallel_all_reduce(inp1)
+                            # the input buffer is immediately modified to test
+                            # synchronization
+                            dist.all_reduce(inp1, group=group)
+                            out2 = tensor_model_parallel_all_reduce(inp2)
+                            dist.all_reduce(inp2, group=group)
+                graph.replay()
+                torch.testing.assert_close(out1, inp1)
+                torch.testing.assert_close(out2, inp2)
 @ray.remote(num_gpus=1, max_calls=1)
-def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
-    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
-    # we use the first group to communicate once
-    # and the second group to communicate twice
-    # and so on
-    # this is used to demonstrate that each group can
-    # communicate independently
-    num_communication = rank // tp_size + 1
-    sz = 1024
-    fa = get_tp_group().ca_comm
-    inp = torch.ones(sz, dtype=torch.float32, device=device)
-    out = inp
-    for _ in range(num_communication):
-        out = fa.all_reduce(out, registered=False)
-    torch.testing.assert_close(out, inp * (tp_size**num_communication))
-    inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
-    out = inp
-    for _ in range(num_communication):
-        out = fa.all_reduce(out, registered=False)
-    torch.testing.assert_close(out, inp * (tp_size**num_communication))
+def eager_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pp_size,
+    rank,
+    distributed_init_port,
+):
+    with monkeypatch.context() as m:
+        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+        device = torch.device(f"cuda:{rank}")
+        torch.cuda.set_device(device)
+        init_test_distributed_environment(tp_size, pp_size, rank,
+                                          distributed_init_port)
+        # we use the first group to communicate once
+        # and the second group to communicate twice
+        # and so on
+        # this is used to demonstrate that each group can
+        # communicate independently
+        num_communication = rank // tp_size + 1
+        sz = 1024
+        fa = get_tp_group().ca_comm
+        inp = torch.ones(sz, dtype=torch.float32, device=device)
+        out = inp
+        for _ in range(num_communication):
+            out = fa.all_reduce(out, registered=False)
+        torch.testing.assert_close(out, inp * (tp_size**num_communication))
+        inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
+        out = inp
+        for _ in range(num_communication):
+            out = fa.all_reduce(out, registered=False)
+        torch.testing.assert_close(out, inp * (tp_size**num_communication))
 @pytest.mark.parametrize("tp_size", [2])
 @pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
 @pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
-def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
+def test_custom_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pipeline_parallel_size,
+    test_target,
+):
     world_size = tp_size * pipeline_parallel_size
     if world_size > torch.cuda.device_count():
         pytest.skip("Not enough GPUs to run the test.")
-    multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
+    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
+                           test_target)

View File

@@ -7,33 +7,35 @@ import pytest
 from vllm.distributed.utils import get_pp_indices
-def test_custom_layer_partition():
-    def _verify(partition_str, num_layers, pp_size, goldens):
-        bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
-        os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
-        for pp_rank, golden in enumerate(goldens):
-            assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
-        if bak is not None:
-            os.environ["VLLM_PP_LAYER_PARTITION"] = bak
-    # Even partition
-    _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
-    # Balanced partition
-    _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
-    # Put reminder somewhere
-    _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
-    # Invalid partition strings
-    with pytest.raises(ValueError):
-        _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
-    with pytest.raises(ValueError):
-        _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
-    # Wrong number of partitions
-    with pytest.raises(ValueError):
-        _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
-    # Wrong number of layers
-    with pytest.raises(ValueError):
-        _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        def _verify(partition_str, num_layers, pp_size, goldens):
+            bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
+            m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
+            for pp_rank, golden in enumerate(goldens):
+                assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
+            if bak is not None:
+                m.setenv("VLLM_PP_LAYER_PARTITION", bak)
+        # Even partition
+        _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+        # Balanced partition
+        _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
+        # Put reminder somewhere
+        _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
+        # Invalid partition strings
+        with pytest.raises(ValueError):
+            _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+        with pytest.raises(ValueError):
+            _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+        # Wrong number of partitions
+        with pytest.raises(ValueError):
+            _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+        # Wrong number of layers
+        with pytest.raises(ValueError):
+            _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
 @pytest.mark.parametrize(
@@ -55,6 +57,10 @@ def test_custom_layer_partition():
     (5, 3, 1, (2, 4)),
     (5, 3, 2, (4, 5)),
 ])
-def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int,
-                               pp_rank: int, indices: tuple[int, int]):
+def test_uneven_auto_partition(
+    num_hidden_layers: int,
+    pp_size: int,
+    pp_rank: int,
+    indices: tuple[int, int],
+):
     assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)

View File

@@ -1,11 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
-import os
+from __future__ import annotations
+from typing import TYPE_CHECKING
 import pytest
 from ..utils import compare_two_settings, fork_new_process_for_each_test
+if TYPE_CHECKING:
+    from typing_extensions import LiteralString
 @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
     (2, "JackFram/llama-160m"),
@@ -15,18 +19,24 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test
     "FLASHINFER",
 ])
 @fork_new_process_for_each_test
-def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
-    cudagraph_args = [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "float16",
-        "--pipeline-parallel-size",
-        str(PP_SIZE),
-        "--distributed-executor-backend",
-        "mp",
-    ]
-    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
-    eager_args = cudagraph_args + ["--enforce-eager"]
-    compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
+def test_pp_cudagraph(
+    monkeypatch: pytest.MonkeyPatch,
+    PP_SIZE: int,
+    MODEL_NAME: str,
+    ATTN_BACKEND: LiteralString,
+):
+    with monkeypatch.context() as m:
+        cudagraph_args = [
+            # use half precision for speed and memory savings in CI environment
+            "--dtype",
+            "float16",
+            "--pipeline-parallel-size",
+            str(PP_SIZE),
+            "--distributed-executor-backend",
+            "mp",
+        ]
+        m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
+        eager_args = cudagraph_args + ["--enforce-eager"]
+        compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)

View File

@@ -49,7 +49,7 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4"
 @pytest.mark.skipif(not current_platform.is_cuda()
                     and not current_platform.is_tpu(),
                     reason="V1 is currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
     """Run with the V1 Engine."""
     with monkeypatch.context() as m:
@@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
         run_test(more_args)
-def test_lm_eval_accuracy_v0_engine(monkeypatch):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
     """Run with the V0 Engine."""
     with monkeypatch.context() as m:

View File

@@ -53,32 +53,37 @@ def cache_models():
 @pytest.mark.skip_global_cleanup
 @pytest.mark.usefixtures("cache_models")
-def test_offline_mode(monkeypatch):
+def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
     # Set HF to offline mode and ensure we can still construct an LLM
-    try:
-        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
-        monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1")
-        def disable_connect(*args, **kwargs):
-            raise RuntimeError("No http calls allowed")
-        monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect",
-                            disable_connect)
-        monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect",
-                            disable_connect)
-        # Need to re-import huggingface_hub and friends to setup offline mode
-        _re_import_modules()
-        # Cached model files should be used in offline mode
-        for model_config in MODEL_CONFIGS:
-            LLM(**model_config)
-    finally:
-        # Reset the environment after the test
-        # NB: Assuming tests are run in online mode
-        monkeypatch.delenv("HF_HUB_OFFLINE")
-        monkeypatch.delenv("VLLM_NO_USAGE_STATS")
-        _re_import_modules()
+    with monkeypatch.context() as m:
+        try:
+            m.setenv("HF_HUB_OFFLINE", "1")
+            m.setenv("VLLM_NO_USAGE_STATS", "1")
+            def disable_connect(*args, **kwargs):
+                raise RuntimeError("No http calls allowed")
+            m.setattr(
+                urllib3.connection.HTTPConnection,
+                "connect",
+                disable_connect,
+            )
+            m.setattr(
+                urllib3.connection.HTTPSConnection,
+                "connect",
+                disable_connect,
+            )
+            # Need to re-import huggingface_hub
+            # and friends to setup offline mode
+            _re_import_modules()
+            # Cached model files should be used in offline mode
+            for model_config in MODEL_CONFIGS:
+                LLM(**model_config)
+        finally:
+            # Reset the environment after the test
+            # NB: Assuming tests are run in online mode
+            _re_import_modules()
+            pass
 def _re_import_modules():
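
The offline-mode test above also shows that the same context can patch attributes, not just environment variables: the urllib3 connect methods are swapped for a function that raises, and the originals come back when the block closes. A reduced sketch of that idea (the target class and assertion are illustrative):

import pytest
import urllib3


def test_blocks_http_connections(monkeypatch: pytest.MonkeyPatch):

    def disable_connect(*args, **kwargs):
        raise RuntimeError("No http calls allowed")

    with monkeypatch.context() as m:
        # Both attribute patches are undone when the context exits.
        m.setattr(urllib3.connection.HTTPConnection, "connect",
                  disable_connect)
        m.setattr(urllib3.connection.HTTPSConnection, "connect",
                  disable_connect)
        conn = urllib3.connection.HTTPConnection("example.com", 80)
        with pytest.raises(RuntimeError):
            conn.connect()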

View File

@@ -70,7 +70,7 @@ def run_test(more_args):
 @pytest.mark.skipif(not current_platform.is_cuda()
                     and not current_platform.is_tpu(),
                     reason="V1 currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
     """Run with the V1 Engine."""
     with monkeypatch.context() as m:
@@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
 @pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
-def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
+                                    more_args):
     """Run with the V0 Engine."""
     with monkeypatch.context() as m:

View File

@@ -5,13 +5,12 @@ from unittest.mock import Mock, patch
 import pytest
 import torch
-from tests.kernels.utils import override_backend_env_variable
 from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
 from vllm.platforms.openvino import OpenVinoPlatform
 from vllm.platforms.rocm import RocmPlatform
-from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
+from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
 @pytest.fixture(autouse=True)
@ -25,87 +24,111 @@ def clear_cache():
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
@pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("use_v1", [True, False])
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
def test_env(name: str, use_v1: bool, device: str, monkeypatch): def test_env(
name: str,
use_v1: bool,
device: str,
monkeypatch: pytest.MonkeyPatch,
):
"""Test that the attention selector can be set via environment variable. """Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend. Note that we do not test FlashAttn because it is the default backend.
""" """
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") with monkeypatch.context() as m:
override_backend_env_variable(monkeypatch, name) m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv(STR_BACKEND_ENV_VAR, name)
if device == "cpu": if device == "cpu":
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
False)
assert backend.get_name() == "TORCH_SDPA"
elif device == "hip":
with patch("vllm.attention.selector.current_platform", RocmPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
False)
EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
assert backend.get_name() == EXPECTED
elif device == "openvino":
with patch("vllm.attention.selector.current_platform",
OpenVinoPlatform()), patch.dict('sys.modules',
{'openvino': Mock()}):
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
False)
assert backend.get_name() == "OPENVINO"
else:
if name in ["XFORMERS", "FLASHINFER"]:
with patch("vllm.attention.selector.current_platform", with patch("vllm.attention.selector.current_platform",
CudaPlatform()): CpuPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, backend = get_attn_backend(16, torch.float16, torch.float16,
16, False) 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name assert backend.get_name() == "TORCH_SDPA"
elif device == "hip":
with patch("vllm.attention.selector.current_platform",
RocmPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
assert backend.get_name() == EXPECTED assert backend.get_name() == EXPECTED
elif device == "openvino":
with patch("vllm.attention.selector.current_platform",
OpenVinoPlatform()), patch.dict('sys.modules',
{'openvino': Mock()}):
backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
assert backend.get_name() == "OPENVINO"
else:
if name in ["XFORMERS", "FLASHINFER"]:
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
backend = get_attn_backend(16, torch.float16,
torch.float16, 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
assert backend.get_name() == EXPECTED
def test_flash_attn(monkeypatch): def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation.""" """Test FlashAttn validation."""
# TODO: When testing for v1, pipe in `use_v1` as an argument to # TODO: When testing for v1, pipe in `use_v1` as an argument to
# get_attn_backend # get_attn_backend
override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
# Unsupported CUDA arch # Unsupported CUDA arch
with patch("torch.cuda.get_device_capability", return_value=(7, 5)): monkeypatch.setattr(torch.cuda, "get_device_capability", lambda:
(7, 5))
backend = get_attn_backend(16, torch.float16, None, 16, False) backend = get_attn_backend(16, torch.float16, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL assert backend.get_name() != STR_FLASH_ATTN_VAL
        # Reset the monkeypatch for subsequent tests
        monkeypatch.undo()

        # Unsupported data type
        backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

        # Unsupported kv cache data type
        backend = get_attn_backend(16, torch.float16, "fp8", 16, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

        # Unsupported block size
        backend = get_attn_backend(16, torch.float16, None, 8, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

        # flash-attn is not installed
        import sys
        original_module = sys.modules.get('vllm_flash_attn')
        monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None)
        backend = get_attn_backend(16, torch.float16, None, 16, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

        # Restore the original module if it existed
        if original_module is not None:
            monkeypatch.setitem(sys.modules, 'vllm_flash_attn',
                                original_module)
        else:
            monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False)

        # Unsupported head size
        backend = get_attn_backend(17, torch.float16, None, 16, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

        # Attention-free models should bypass env and use PlaceholderAttention
        backend = get_attn_backend(16, torch.float16, torch.float16, 16, True)
        assert backend.get_name() != STR_FLASH_ATTN_VAL
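The `vllm_flash_attn` handling above saves and restores the `sys.modules` entry through `monkeypatch.setitem`/`delitem` rather than `patch.dict`. A self-contained sketch of simulating a missing package that way, using `colorsys` purely as a harmless stand-in module:

import importlib
import sys

import pytest


def can_import(name: str) -> bool:
    # Illustrative helper: True if `name` is importable right now.
    try:
        importlib.import_module(name)
        return True
    except ImportError:
        return False


def test_simulate_missing_package(monkeypatch: pytest.MonkeyPatch):
    assert can_import("colorsys")
    # Mapping a module name to None makes any import of it raise
    # ImportError; monkeypatch restores the original sys.modules entry
    # (or removes the key again) during teardown.
    monkeypatch.setitem(sys.modules, "colorsys", None)
    assert not can_import("colorsys")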
@pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("use_v1", [True, False])
def test_invalid_env(use_v1: bool, monkeypatch): def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
"""Ignore the invalid env variable if it is set."""
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
with patch("vllm.attention.selector.current_platform", CudaPlatform()): with monkeypatch.context() as m, patch(
"vllm.attention.selector.current_platform", CudaPlatform()):
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
# Test with head size 32
backend = get_attn_backend(32, torch.float16, None, 16, False) backend = get_attn_backend(32, torch.float16, None, 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"
assert backend.get_name() == EXPECTED assert backend.get_name() == EXPECTED
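Taken together, these hunks follow one pattern: environment overrides live inside `monkeypatch.context()` so they are reverted even when an assertion fails. A minimal, hypothetical sketch of that pattern; `MY_BACKEND` and `read_backend` are illustrative names, not vLLM APIs:

import os

import pytest


def read_backend() -> str:
    # Stand-in for code that consults an environment variable,
    # e.g. an attention-backend selector.
    return os.environ.get("MY_BACKEND", "default")


def test_backend_override(monkeypatch: pytest.MonkeyPatch):
    monkeypatch.delenv("MY_BACKEND", raising=False)  # known starting state
    with monkeypatch.context() as m:
        m.setenv("MY_BACKEND", "flash")
        assert read_backend() == "flash"
    # The context manager undoes the override here, so later code in this
    # test (and later tests) sees the original environment.
    assert read_backend() == "default"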

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
import torch import torch
@ -11,36 +9,38 @@ from vllm import _custom_ops as ops # noqa: F401
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"), @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
reason="AWQ is not supported on this GPU type.") reason="AWQ is not supported on this GPU type.")
def test_awq_dequantize_opcheck(): def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch):
os.environ["VLLM_USE_TRITON_AWQ"] = "0" with monkeypatch.context() as m:
qweight = torch.randint(-2000000000, m.setenv("VLLM_USE_TRITON_AWQ", "0")
2000000000, (8192, 256), qweight = torch.randint(-2000000000,
device='cuda', 2000000000, (8192, 256),
dtype=torch.int32) device='cuda',
scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16) dtype=torch.int32)
zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32) scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16)
split_k_iters = 0 zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32)
thx = 0 split_k_iters = 0
thy = 0 thx = 0
opcheck(torch.ops._C.awq_dequantize, thy = 0
(qweight, scales, zeros, split_k_iters, thx, thy)) opcheck(torch.ops._C.awq_dequantize,
(qweight, scales, zeros, split_k_iters, thx, thy))
@pytest.mark.skip(reason="Not working; needs investigation.") @pytest.mark.skip(reason="Not working; needs investigation.")
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"), @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
reason="AWQ is not supported on this GPU type.") reason="AWQ is not supported on this GPU type.")
def test_awq_gemm_opcheck(): def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
os.environ["VLLM_USE_TRITON_AWQ"] = "0" with monkeypatch.context() as m:
input = torch.rand((2, 8192), device='cuda', dtype=torch.float16) m.setenv("VLLM_USE_TRITON_AWQ", "0")
qweight = torch.randint(-2000000000, input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
2000000000, (8192, 256), qweight = torch.randint(-2000000000,
device='cuda', 2000000000, (8192, 256),
dtype=torch.int32) device='cuda',
scales = torch.randint(-2000000000, dtype=torch.int32)
2000000000, (64, 256), scales = torch.randint(-2000000000,
device='cuda', 2000000000, (64, 256),
dtype=torch.int32) device='cuda',
qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16) dtype=torch.int32)
split_k_iters = 8 qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16)
opcheck(torch.ops._C.awq_gemm, split_k_iters = 8
(input, qweight, qzeros, scales, split_k_iters)) opcheck(torch.ops._C.awq_gemm,
(input, qweight, qzeros, scales, split_k_iters))
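The change above is representative of the whole commit: a bare `os.environ[...] = ...` assignment persists after the test finishes, while `monkeypatch.setenv` is undone automatically. An illustrative sketch of the difference, not vLLM code:

import os

import pytest


def test_antipattern_direct_assignment():
    # Anti-pattern, shown only for contrast: this override outlives the
    # test, so later tests inherit it and become order-dependent.
    os.environ["VLLM_USE_TRITON_AWQ"] = "0"
    assert os.environ["VLLM_USE_TRITON_AWQ"] == "0"


def test_monkeypatch_is_scoped(monkeypatch: pytest.MonkeyPatch):
    # monkeypatch records the previous value (or absence) of the key and
    # restores it during teardown, keeping the environment clean.
    monkeypatch.setenv("VLLM_USE_TRITON_AWQ", "0")
    assert os.environ["VLLM_USE_TRITON_AWQ"] == "0"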

View File

@ -1,13 +1,11 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from unittest.mock import patch
import pytest import pytest
import torch import torch
from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.rocm import RocmPlatform from vllm.platforms.rocm import RocmPlatform
from vllm.utils import STR_BACKEND_ENV_VAR
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
@ -17,15 +15,19 @@ def clear_cache():
_cached_get_attn_backend.cache_clear() _cached_get_attn_backend.cache_clear()
def test_selector(monkeypatch): def test_selector(monkeypatch: pytest.MonkeyPatch):
"""Test that the attention selector for ROCm. with monkeypatch.context() as m:
""" m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")
override_backend_env_variable(monkeypatch, "ROCM_FLASH")
with patch("vllm.attention.selector.current_platform", RocmPlatform()): # Set the current platform to ROCm using monkeypatch
monkeypatch.setattr("vllm.attention.selector.current_platform",
RocmPlatform())
# Test standard ROCm attention
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert (backend.get_name() == "ROCM_FLASH" assert (backend.get_name() == "ROCM_FLASH"
or backend.get_name() == "ROCM_ATTN_VLLM_V1") or backend.get_name() == "ROCM_ATTN_VLLM_V1")
# mla test for deepseek related # mla test for deepseek related
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
False, True) False, True)
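Besides `setenv`, the hunk above also uses `monkeypatch.setattr` with a dotted-path string to swap the selector's `current_platform`. A self-contained sketch of the same mechanism on a stand-in object; the real tests pass a string such as "vllm.attention.selector.current_platform", which behaves the same way:

import types

import pytest

# Stand-in for a module whose attribute the code under test reads.
fake_selector = types.SimpleNamespace(current_platform="cuda")


def pick_backend() -> str:
    # Hypothetical selector logic keyed off the platform attribute.
    if fake_selector.current_platform == "rocm":
        return "ROCM_FLASH"
    return "FLASH_ATTN"


def test_setattr_is_scoped(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setattr(fake_selector, "current_platform", "rocm")
        assert pick_backend() == "ROCM_FLASH"
    # The original attribute value is restored on context exit.
    assert pick_backend() == "FLASH_ATTN"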

View File

@ -12,11 +12,10 @@ import pytest
from tests.kernels.utils import override_backend_env_variable from tests.kernels.utils import override_backend_env_variable
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
os.environ["TOKENIZERS_PARALLELISM"] = "true"
@pytest.mark.quant_model @pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8"),
@ -55,45 +54,47 @@ def test_models(
backend: str, backend: str,
tensor_parallel_size: int, tensor_parallel_size: int,
disable_async_output_proc: bool, disable_async_output_proc: bool,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Only checks log probs match to cover the discrepancy in Only checks log probs match to cover the discrepancy in
numerical sensitive kernels. numerical sensitive kernels.
""" """
override_backend_env_variable(monkeypatch, backend) with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
m.setenv(STR_BACKEND_ENV_VAR, backend)
MAX_MODEL_LEN = 1024 MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8 NUM_LOG_PROBS = 8
with vllm_runner( with vllm_runner(
base_model, base_model,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
kv_cache_dtype="auto", kv_cache_dtype="auto",
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
) as vllm_model: ) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs( baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner( with vllm_runner(
test_model, test_model,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype, kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
) as vllm_model: ) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs( test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=baseline_outputs, outputs_0_lst=baseline_outputs,
outputs_1_lst=test_outputs, outputs_1_lst=test_outputs,
name_0="fp16_kv_cache", name_0="fp16_kv_cache",
name_1="fp8_kv_cache", name_1="fp8_kv_cache",
) )
@pytest.mark.cpu_model @pytest.mark.cpu_model
@ -119,38 +120,41 @@ def test_cpu_models(
test_model: str, test_model: str,
max_tokens: int, max_tokens: int,
disable_async_output_proc: bool, disable_async_output_proc: bool,
monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Only checks log probs match to cover the discrepancy in Only checks log probs match to cover the discrepancy in
numerical sensitive kernels. numerical sensitive kernels.
""" """
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
MAX_MODEL_LEN = 1024 MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8 NUM_LOG_PROBS = 8
with vllm_runner( with vllm_runner(
base_model, base_model,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
dtype="bfloat16", dtype="bfloat16",
kv_cache_dtype="auto", kv_cache_dtype="auto",
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
) as vllm_model: ) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs( baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner( with vllm_runner(
test_model, test_model,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
dtype="bfloat16", dtype="bfloat16",
kv_cache_dtype=kv_cache_dtype, kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
) as vllm_model: ) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs( test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=baseline_outputs, outputs_0_lst=baseline_outputs,
outputs_1_lst=test_outputs, outputs_1_lst=test_outputs,
name_0="bf16_kv_cache", name_0="bf16_kv_cache",
name_1="fp8_kv_cache", name_1="fp8_kv_cache",
) )

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import importlib.util import importlib.util
import math import math
@ -11,6 +12,7 @@ from scipy.spatial.distance import cosine
import vllm import vllm
import vllm.config import vllm.config
from vllm.utils import STR_BACKEND_ENV_VAR
from ....utils import RemoteOpenAIServer from ....utils import RemoteOpenAIServer
@ -29,36 +31,34 @@ def _arr(arr):
return array("i", arr) return array("i", arr)
def test_find_array(monkeypatch): def test_find_array(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
from vllm.model_executor.models.gritlm import GritLMPooler from vllm.model_executor.models.gritlm import GritLMPooler
# Create an LLM object to get the model config. # Create an LLM object to get the model config.
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
pooler = GritLMPooler(model_config=llm.llm_engine.model_config) pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
with pytest.raises(ValueError): with pytest.raises(ValueError):
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server_embedding(): def server_embedding():
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
with pytest.MonkeyPatch.context() as mp: args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
@ -69,9 +69,12 @@ def server_generate():
@pytest_asyncio.fixture @pytest_asyncio.fixture
async def client_embedding(server_embedding: RemoteOpenAIServer): async def client_embedding(monkeypatch: pytest.MonkeyPatch,
async with server_embedding.get_async_client() as async_client: server_embedding: RemoteOpenAIServer):
yield async_client with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
async with server_embedding.get_async_client() as async_client:
yield async_client
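Fixture scope matters here: the built-in `monkeypatch` fixture is function-scoped, so the module-scoped `server_embedding` fixture cannot request it, and the env override moves into the function-scoped client fixture. When an override genuinely has to live in a broader-scoped fixture, `pytest.MonkeyPatch.context()` can be driven by hand, as in this sketch with placeholder values:

import os

import pytest


@pytest.fixture(scope="module")
def xformers_env():
    # Module-scoped fixtures cannot use the function-scoped `monkeypatch`
    # fixture, but they can use MonkeyPatch directly; the override is
    # undone when the context exits at module teardown.
    with pytest.MonkeyPatch.context() as mp:
        mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
        yield


def test_backend_is_set(xformers_env):
    assert os.environ["VLLM_ATTENTION_BACKEND"] == "XFORMERS"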
@pytest_asyncio.fixture @pytest_asyncio.fixture
@ -80,14 +83,20 @@ async def client_generate(server_generate: RemoteOpenAIServer):
yield async_client yield async_client
def run_llm_encode(llm: vllm.LLM, queries: list[str], def run_llm_encode(
instruction: str) -> list[float]: llm: vllm.LLM,
queries: list[str],
instruction: str,
) -> list[float]:
outputs = llm.encode([instruction + q for q in queries], ) outputs = llm.encode([instruction + q for q in queries], )
return [output.outputs.embedding for output in outputs] return [output.outputs.embedding for output in outputs]
async def run_client_embeddings(client: vllm.LLM, queries: list[str], async def run_client_embeddings(
instruction: str) -> list[float]: client: vllm.LLM,
queries: list[str],
instruction: str,
) -> list[float]:
outputs = await client.embeddings.create( outputs = await client.embeddings.create(
model=MODEL_NAME, model=MODEL_NAME,
input=[instruction + q for q in queries], input=[instruction + q for q in queries],
@ -106,7 +115,7 @@ def get_test_data():
README.md in https://github.com/ContextualAI/gritlm README.md in https://github.com/ContextualAI/gritlm
""" """
q_instruction = gritlm_instruction( q_instruction = gritlm_instruction(
"Given a scientific paper title, retrieve the paper's abstract") "Given a scientific paper title, retrieve the paper's abstract", )
queries = [ queries = [
"Bitcoin: A Peer-to-Peer Electronic Cash System", "Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning", "Generative Representational Instruction Tuning",
@ -136,31 +145,32 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]):
assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001)
def test_gritlm_offline_embedding(monkeypatch): def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
queries, q_instruction, documents, d_instruction = get_test_data() queries, q_instruction, documents, d_instruction = get_test_data()
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
d_rep = run_llm_encode( d_rep = run_llm_encode(
llm, llm,
documents, documents,
d_instruction, d_instruction,
) )
q_rep = run_llm_encode( q_rep = run_llm_encode(
llm, llm,
queries, queries,
q_instruction, q_instruction,
) )
validate_embed_output(q_rep, d_rep) validate_embed_output(q_rep, d_rep)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_gritlm_api_server_embedding( async def test_gritlm_api_server_embedding(
client_embedding: openai.AsyncOpenAI): client_embedding: openai.AsyncOpenAI, ):
queries, q_instruction, documents, d_instruction = get_test_data() queries, q_instruction, documents, d_instruction = get_test_data()
d_rep = await run_client_embeddings( d_rep = await run_client_embeddings(

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
@ -11,76 +9,92 @@ from ..utils import fork_new_process_for_each_test
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_plugin(dummy_opt_path, monkeypatch): def test_plugin(
monkeypatch: pytest.MonkeyPatch,
dummy_opt_path: str,
):
# V1 shuts down rather than raising an error here. # V1 shuts down rather than raising an error here.
monkeypatch.setenv("VLLM_USE_V1", "0") with monkeypatch.context() as m:
os.environ["VLLM_PLUGINS"] = "" m.setenv("VLLM_USE_V1", "0")
with pytest.raises(Exception) as excinfo: m.setenv("VLLM_PLUGINS", "")
LLM(model=dummy_opt_path, load_format="dummy")
error_msg = "has no vLLM implementation and " \ with pytest.raises(Exception) as excinfo:
"the Transformers implementation is not compatible with vLLM" LLM(model=dummy_opt_path, load_format="dummy")
assert (error_msg in str(excinfo.value)) error_msg = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM" # noqa: E501
assert (error_msg in str(excinfo.value))
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_oot_registration_text_generation(dummy_opt_path): def test_oot_registration_text_generation(
os.environ["VLLM_PLUGINS"] = "register_dummy_model" monkeypatch: pytest.MonkeyPatch,
prompts = ["Hello, my name is", "The text does not matter"] dummy_opt_path: str,
sampling_params = SamplingParams(temperature=0) ):
llm = LLM(model=dummy_opt_path, load_format="dummy") with monkeypatch.context() as m:
first_token = llm.get_tokenizer().decode(0) m.setenv("VLLM_PLUGINS", "register_dummy_model")
outputs = llm.generate(prompts, sampling_params) prompts = ["Hello, my name is", "The text does not matter"]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model=dummy_opt_path, load_format="dummy")
first_token = llm.get_tokenizer().decode(0)
outputs = llm.generate(prompts, sampling_params)
for output in outputs: for output in outputs:
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
# make sure only the first token is generated # make sure only the first token is generated
rest = generated_text.replace(first_token, "") rest = generated_text.replace(first_token, "")
assert rest == "" assert rest == ""
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_oot_registration_embedding(dummy_gemma2_embedding_path): def test_oot_registration_embedding(
os.environ["VLLM_PLUGINS"] = "register_dummy_model" monkeypatch: pytest.MonkeyPatch,
prompts = ["Hello, my name is", "The text does not matter"] dummy_gemma2_embedding_path: str,
llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") ):
outputs = llm.embed(prompts) with monkeypatch.context() as m:
m.setenv("VLLM_PLUGINS", "register_dummy_model")
prompts = ["Hello, my name is", "The text does not matter"]
llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
outputs = llm.embed(prompts)
for output in outputs: for output in outputs:
assert all(v == 0 for v in output.outputs.embedding) assert all(v == 0 for v in output.outputs.embedding)
image = ImageAsset("cherry_blossom").pil_image.convert("RGB") image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_oot_registration_multimodal(dummy_llava_path, monkeypatch): def test_oot_registration_multimodal(
os.environ["VLLM_PLUGINS"] = "register_dummy_model" monkeypatch: pytest.MonkeyPatch,
prompts = [{ dummy_llava_path: str,
"prompt": "What's in the image?<image>", ):
"multi_modal_data": { with monkeypatch.context() as m:
"image": image m.setenv("VLLM_PLUGINS", "register_dummy_model")
}, prompts = [{
}, { "prompt": "What's in the image?<image>",
"prompt": "Describe the image<image>", "multi_modal_data": {
"multi_modal_data": { "image": image
"image": image },
}, }, {
}] "prompt": "Describe the image<image>",
"multi_modal_data": {
"image": image
},
}]
sampling_params = SamplingParams(temperature=0) sampling_params = SamplingParams(temperature=0)
llm = LLM(model=dummy_llava_path, llm = LLM(model=dummy_llava_path,
load_format="dummy", load_format="dummy",
max_num_seqs=1, max_num_seqs=1,
trust_remote_code=True, trust_remote_code=True,
gpu_memory_utilization=0.98, gpu_memory_utilization=0.98,
max_model_len=4096, max_model_len=4096,
enforce_eager=True, enforce_eager=True,
limit_mm_per_prompt={"image": 1}) limit_mm_per_prompt={"image": 1})
first_token = llm.get_tokenizer().decode(0) first_token = llm.get_tokenizer().decode(0)
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
for output in outputs: for output in outputs:
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
# make sure only the first token is generated # make sure only the first token is generated
rest = generated_text.replace(first_token, "") rest = generated_text.replace(first_token, "")
assert rest == "" assert rest == ""

View File

@ -235,25 +235,28 @@ async def test_bad_request(tmp_socket):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_mp_crash_detection(monkeypatch): async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.") parser = FlexibleArgumentParser(
parser = make_arg_parser(parser) description="vLLM's remote OpenAI server.")
args = parser.parse_args([]) parser = make_arg_parser(parser)
args = parser.parse_args([])
# When LLMEngine is loaded, it will crash. # When LLMEngine is loaded, it will crash.
def mock_init(): def mock_init():
raise ValueError raise ValueError
monkeypatch.setattr(LLMEngine, "__init__", mock_init) m.setattr(LLMEngine, "__init__", mock_init)
start = time.perf_counter() start = time.perf_counter()
async with build_async_engine_client(args): async with build_async_engine_client(args):
pass pass
end = time.perf_counter() end = time.perf_counter()
assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s " assert end - start < 60, (
"if there is an error in the startup.") "Expected vLLM to gracefully shutdown in <60s "
"if there is an error in the startup.")
@pytest.mark.asyncio @pytest.mark.asyncio

View File

@ -5,7 +5,7 @@ from typing import Optional
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close from ..models.utils import check_logprobs_close
from ..utils import (completions_with_server_args, get_client_text_generations, from ..utils import (completions_with_server_args, get_client_text_generations,
@ -52,7 +52,7 @@ async def test_multi_step(
num_logprobs: Optional[int], num_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
enable_chunked_prefill: bool, enable_chunked_prefill: bool,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol """Test vLLM engine with multi-step scheduling in an OpenAI-protocol
client/server environment. client/server environment.
@ -82,67 +82,70 @@ async def test_multi_step(
pytest.skip("Multi-step with Chunked-Prefill only supports" pytest.skip("Multi-step with Chunked-Prefill only supports"
"PP=1 and FLASH_ATTN backend") "PP=1 and FLASH_ATTN backend")
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts prompts = example_prompts
if len(prompts) < num_prompts: if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1) prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts] prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts assert len(prompts) == num_prompts
server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
ms_server_args = DEFAULT_SERVER_ARGS + \ ms_server_args = DEFAULT_SERVER_ARGS + \
["--num-scheduler-steps", f"{num_scheduler_steps}"] ["--num-scheduler-steps", f"{num_scheduler_steps}"]
if not is_async: if not is_async:
ms_server_args += ["--disable-async-output-proc"] ms_server_args += ["--disable-async-output-proc"]
if eager_mode: if eager_mode:
ms_server_args.append("--enforce-eager") ms_server_args.append("--enforce-eager")
if enable_chunked_prefill: if enable_chunked_prefill:
ms_server_args.append("--enable-chunked-prefill") ms_server_args.append("--enable-chunked-prefill")
distributed_args = [ distributed_args = [
"--tensor-parallel-size", "--tensor-parallel-size",
str(tp_size), str(tp_size),
"--pipeline-parallel-size", "--pipeline-parallel-size",
str(pp_size), str(pp_size),
] ]
# Spin up client/server & issue completion API requests. # Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically # Default `max_wait_seconds` is 240 but was empirically
# was raised 5x to 1200 *just for this test* due to # was raised 5x to 1200 *just for this test* due to
# observed timeouts in GHA CI # observed timeouts in GHA CI
ref_completions = await completions_with_server_args( ref_completions = await completions_with_server_args(
prompts, prompts,
model, model,
server_args + distributed_args, server_args + distributed_args,
num_logprobs, num_logprobs,
max_wait_seconds=5 * 240) max_wait_seconds=5 * 240)
test_completions = await completions_with_server_args( test_completions = await completions_with_server_args(
prompts, prompts,
model, model,
ms_server_args + distributed_args, ms_server_args + distributed_args,
num_logprobs, num_logprobs,
max_wait_seconds=5 * 240) max_wait_seconds=5 * 240)
# Assert multi-step scheduling produces identical tokens # Assert multi-step scheduling produces identical tokens
# to single-step scheduling. # to single-step scheduling.
ref_generations = get_client_text_generations(ref_completions) ref_generations = get_client_text_generations(ref_completions)
test_generations = get_client_text_generations(test_completions) test_generations = get_client_text_generations(test_completions)
assert ref_generations == test_generations assert ref_generations == test_generations
# Assert multi-step scheduling produces nearly-identical logprobs # Assert multi-step scheduling produces nearly-identical logprobs
# to single-step scheduling. # to single-step scheduling.
ref_text_logprobs = get_client_text_logprob_generations(ref_completions) ref_text_logprobs = get_client_text_logprob_generations(
test_text_logprobs = get_client_text_logprob_generations(test_completions) ref_completions)
check_logprobs_close( test_text_logprobs = get_client_text_logprob_generations(
outputs_0_lst=ref_text_logprobs, test_completions)
outputs_1_lst=test_text_logprobs, check_logprobs_close(
name_0="hf", outputs_0_lst=ref_text_logprobs,
name_1="vllm", outputs_1_lst=test_text_logprobs,
) name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize(("tp_size, pp_size"), [ @pytest.mark.parametrize(("tp_size, pp_size"), [
@ -152,7 +155,7 @@ async def test_multi_step(
async def test_multi_step_pp_smoke( async def test_multi_step_pp_smoke(
tp_size: int, tp_size: int,
pp_size: int, pp_size: int,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Smoke test for the vLLM engine with multi-step scheduling in an Smoke test for the vLLM engine with multi-step scheduling in an
@ -174,54 +177,55 @@ async def test_multi_step_pp_smoke(
attention_backend = "FLASH_ATTN" attention_backend = "FLASH_ATTN"
max_num_seqs = 3 max_num_seqs = 3
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
# Prompt from the ShareGPT dataset # Prompt from the ShareGPT dataset
prompts = [ prompts = [
"in the jtbd context whats a push?", # codespell:ignore "in the jtbd context whats a push?", # codespell:ignore
"in the jtbd context whats a push?", # codespell:ignore "in the jtbd context whats a push?", # codespell:ignore
"in the jtbd context whats a push?", # codespell:ignore "in the jtbd context whats a push?", # codespell:ignore
"in the jtbd context whats a push?", # codespell:ignore "in the jtbd context whats a push?", # codespell:ignore
] ]
# Use varying max_tokens to introduce scheduling randomness. # Use varying max_tokens to introduce scheduling randomness.
max_tokens = [10 * i for i in range(1, len(prompts) + 1)] max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
assert len(prompts) == len(max_tokens) assert len(prompts) == len(max_tokens)
test_args = [ test_args = [
"--tensor-parallel-size", "--tensor-parallel-size",
str(tp_size), "--pipeline-parallel-size", str(tp_size), "--pipeline-parallel-size",
str(pp_size), "--max-num-seqs", str(pp_size), "--max-num-seqs",
str(max_num_seqs) str(max_num_seqs)
] ]
server_args = DEFAULT_SERVER_ARGS + test_args server_args = DEFAULT_SERVER_ARGS + test_args
ms_server_args = DEFAULT_SERVER_ARGS + \ ms_server_args = DEFAULT_SERVER_ARGS + \
["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
test_args test_args
# Spin up client/server & issue completion API requests. # Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically # Default `max_wait_seconds` is 240 but was empirically
# was raised 3x to 720 *just for this test* due to # was raised 3x to 720 *just for this test* due to
# observed timeouts in GHA CI # observed timeouts in GHA CI
ref_completions = await completions_with_server_args( ref_completions = await completions_with_server_args(
prompts=prompts, prompts=prompts,
model_name=model, model_name=model,
server_cli_args=server_args, server_cli_args=server_args,
num_logprobs=None, num_logprobs=None,
max_wait_seconds=5 * 240, max_wait_seconds=5 * 240,
max_tokens=max_tokens) max_tokens=max_tokens)
test_completions = await completions_with_server_args( test_completions = await completions_with_server_args(
prompts=prompts, prompts=prompts,
model_name=model, model_name=model,
server_cli_args=ms_server_args, server_cli_args=ms_server_args,
num_logprobs=None, num_logprobs=None,
max_wait_seconds=5 * 240, max_wait_seconds=5 * 240,
max_tokens=max_tokens) max_tokens=max_tokens)
# Assert multi-step scheduling produces identical tokens # Assert multi-step scheduling produces identical tokens
# to single-step scheduling. # to single-step scheduling.
ref_generations = get_client_text_generations(ref_completions) ref_generations = get_client_text_generations(ref_completions)
test_generations = get_client_text_generations(test_completions) test_generations = get_client_text_generations(test_completions)
assert ref_generations == test_generations assert ref_generations == test_generations
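In these client/server tests the backend variable is set before the servers are spun up; `monkeypatch.setenv` mutates `os.environ` of the test process, so subprocesses started inside the context inherit the override, and teardown restores the original value afterwards. A small standard-library sketch of that behaviour, illustrative rather than vLLM code:

import subprocess
import sys

import pytest


def test_subprocess_inherits_env(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
        # A child started inside the context receives a copy of os.environ,
        # so the override is visible to it.
        result = subprocess.run(
            [
                sys.executable,
                "-c",
                "import os; print(os.environ['VLLM_ATTENTION_BACKEND'])",
            ],
            capture_output=True,
            text=True,
            check=True,
        )
    assert result.stdout.strip() == "FLASH_ATTN"
    # Children started after the context exits see the original environment.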

View File

@ -7,7 +7,7 @@ from typing import Optional
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal from ..models.utils import check_logprobs_close, check_outputs_equal
@ -42,7 +42,7 @@ def test_multi_step_llm(
num_prompts: int, num_prompts: int,
num_logprobs: Optional[int], num_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test vLLM engine with multi-step scheduling via sync LLM Engine. """Test vLLM engine with multi-step scheduling via sync LLM Engine.
@ -70,48 +70,49 @@ def test_multi_step_llm(
num_logprobs: corresponds to the `logprobs` argument to the OpenAI num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> 1 logprob returned. completions endpoint; `None` -> 1 logprob returned.
""" """
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts prompts = example_prompts
if len(prompts) < num_prompts: if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1) prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts] prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts assert len(prompts) == num_prompts
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
) as vllm_model: ) as vllm_model:
vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
if num_logprobs is None else if num_logprobs is None else
vllm_model.generate_greedy_logprobs( vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs)) prompts, max_tokens, num_logprobs))
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
if num_logprobs is None else if num_logprobs is None else
hf_model.generate_greedy_logprobs_limit( hf_model.generate_greedy_logprobs_limit(
prompts, max_tokens, num_logprobs)) prompts, max_tokens, num_logprobs))
if num_logprobs is None: if num_logprobs is None:
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
else: else:
check_logprobs_close( check_logprobs_close(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@ -136,7 +137,7 @@ def test_multi_step_llm_w_prompt_logprobs(
num_logprobs: Optional[int], num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int], num_prompt_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test prompt logprobs with multi-step scheduling via sync LLM Engine. """Test prompt logprobs with multi-step scheduling via sync LLM Engine.
@ -166,47 +167,48 @@ def test_multi_step_llm_w_prompt_logprobs(
note that this argument is not supported by the note that this argument is not supported by the
OpenAI completions endpoint. OpenAI completions endpoint.
""" """
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts prompts = example_prompts
if len(prompts) < num_prompts: if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1) prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts] prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts assert len(prompts) == num_prompts
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs( vllm_outputs = vllm_model.generate_greedy_logprobs(
prompts, prompts,
max_tokens, max_tokens,
num_logprobs, num_logprobs,
num_prompt_logprobs=num_prompt_logprobs) num_prompt_logprobs=num_prompt_logprobs)
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
) as vllm_model: ) as vllm_model:
single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
prompts, prompts,
max_tokens, max_tokens,
num_logprobs, num_logprobs,
num_prompt_logprobs=num_prompt_logprobs) num_prompt_logprobs=num_prompt_logprobs)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=single_step_vllm_outputs, outputs_0_lst=single_step_vllm_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@ -230,7 +232,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
num_prompts: int, num_prompts: int,
num_logprobs: Optional[int], num_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test vLLM engine with multi-step+"single-step chunked prefill"+APC. """Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
@ -293,77 +295,78 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
# #
# The Incorrect scheduling behavior - if it occurs - will cause an exception # The Incorrect scheduling behavior - if it occurs - will cause an exception
# in the model runner resulting from `do_sample=False`. # in the model runner resulting from `do_sample=False`.
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
assert len(example_prompts) >= 2 assert len(example_prompts) >= 2
challenge_prompts = copy.deepcopy(example_prompts) challenge_prompts = copy.deepcopy(example_prompts)
challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient ' challenge_prompts[0] = (
'inference and serving engine for LLMs.\n' 'vLLM is a high-throughput and memory-efficient '
) # 24 tok 'inference and serving engine for LLMs.\n') # 24 tok
challenge_prompts[1] = ( challenge_prompts[1] = (
'Briefly describe the major milestones in the ' 'Briefly describe the major milestones in the '
'development of artificial intelligence from 1950 to 2020.\n' 'development of artificial intelligence from 1950 to 2020.\n'
) # 30 tok ) # 30 tok
# If necessary, adjust the length of `challenge_prompts` to match # If necessary, adjust the length of `challenge_prompts` to match
# `num_prompts` # `num_prompts`
if len(challenge_prompts) < num_prompts: if len(challenge_prompts) < num_prompts:
challenge_prompts = (challenge_prompts * challenge_prompts = (challenge_prompts *
((num_prompts // len(challenge_prompts)) + 1)) ((num_prompts // len(challenge_prompts)) + 1))
challenge_prompts = challenge_prompts[:num_prompts] challenge_prompts = challenge_prompts[:num_prompts]
assert len(challenge_prompts) == num_prompts assert len(challenge_prompts) == num_prompts
# Single-step scheduler baseline # Single-step scheduler baseline
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
max_model_len=48, max_model_len=48,
max_num_batched_tokens=48, max_num_batched_tokens=48,
max_num_seqs=4, max_num_seqs=4,
block_size=16, block_size=16,
) as vllm_model: ) as vllm_model:
outputs_baseline = (vllm_model.generate_greedy( outputs_baseline = (
challenge_prompts, max_tokens) if num_logprobs is None else vllm_model.generate_greedy(challenge_prompts, max_tokens) if
vllm_model.generate_greedy_logprobs( num_logprobs is None else vllm_model.generate_greedy_logprobs(
challenge_prompts, max_tokens, num_logprobs)) challenge_prompts, max_tokens, num_logprobs))
# multi-step+"single-step chunked prefill"+APC # multi-step+"single-step chunked prefill"+APC
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
enable_chunked_prefill=True, enable_chunked_prefill=True,
enable_prefix_caching=True, enable_prefix_caching=True,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
max_model_len=48, max_model_len=48,
max_num_batched_tokens=48, max_num_batched_tokens=48,
max_num_seqs=4, max_num_seqs=4,
block_size=16, block_size=16,
) as vllm_model: ) as vllm_model:
outputs_w_features = (vllm_model.generate_greedy( outputs_w_features = (
challenge_prompts, max_tokens) if num_logprobs is None else vllm_model.generate_greedy(challenge_prompts, max_tokens) if
vllm_model.generate_greedy_logprobs( num_logprobs is None else vllm_model.generate_greedy_logprobs(
challenge_prompts, max_tokens, num_logprobs)) challenge_prompts, max_tokens, num_logprobs))
if num_logprobs is None: if num_logprobs is None:
# No-logprobs test # No-logprobs test
check_outputs_equal( check_outputs_equal(
outputs_0_lst=outputs_baseline, outputs_0_lst=outputs_baseline,
outputs_1_lst=outputs_w_features, outputs_1_lst=outputs_w_features,
name_0="multi-step", name_0="multi-step",
name_1="multi-step+features", name_1="multi-step+features",
) )
else: else:
# Yes-logprobs test # Yes-logprobs test
check_logprobs_close( check_logprobs_close(
outputs_0_lst=outputs_baseline, outputs_0_lst=outputs_baseline,
outputs_1_lst=outputs_w_features, outputs_1_lst=outputs_w_features,
name_0="multi-step", name_0="multi-step",
name_1="multi-step+features", name_1="multi-step+features",
) )

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import neuronxcc.nki.language as nl import neuronxcc.nki.language as nl
import pytest import pytest
@ -99,6 +98,7 @@ def ref_block_tables_transform(
) )
@torch.inference_mode() @torch.inference_mode()
def test_load_and_transform_block_tables( def test_load_and_transform_block_tables(
monkeypatch: pytest.MonkeyPatch,
num_tiles, num_tiles,
num_blocks_per_tile, num_blocks_per_tile,
q_head_per_kv_head, q_head_per_kv_head,
@ -108,46 +108,46 @@ def test_load_and_transform_block_tables(
device = xm.xla_device() device = xm.xla_device()
compiler_flags = [ compiler_flags_str = " ".join([
"-O1", "-O1",
"--retry_failed_compilation", "--retry_failed_compilation",
] ])
compiler_flags_str = " ".join(compiler_flags) with monkeypatch.context() as m:
os.environ["NEURON_CC_FLAGS"] = compiler_flags_str m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
torch.manual_seed(10000) torch.manual_seed(10000)
torch.set_printoptions(sci_mode=False) torch.set_printoptions(sci_mode=False)
# On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
B_P_SIZE = 128 B_P_SIZE = 128
if num_blocks_per_tile < B_P_SIZE: if num_blocks_per_tile < B_P_SIZE:
assert B_P_SIZE % num_blocks_per_tile == 0 assert B_P_SIZE % num_blocks_per_tile == 0
block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
else: else:
block_size_tiling_factor = 1 block_size_tiling_factor = 1
max_num_blocks = 100000 max_num_blocks = 100000
block_tables = torch.randint( block_tables = torch.randint(
0, 0,
max_num_blocks, max_num_blocks,
(num_tiles * num_blocks_per_tile, ), (num_tiles * num_blocks_per_tile, ),
dtype=torch.int32, dtype=torch.int32,
) )
nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1]( nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
block_tables.to(device=device), block_tables.to(device=device),
num_tiles, num_tiles,
num_blocks_per_tile, num_blocks_per_tile,
q_head_per_kv_head, q_head_per_kv_head,
head_id, head_id,
block_size_tiling_factor, block_size_tiling_factor,
).cpu() ).cpu()
ref_out = ref_block_tables_transform( ref_out = ref_block_tables_transform(
block_tables, block_tables,
num_tiles, num_tiles,
num_blocks_per_tile, num_blocks_per_tile,
q_head_per_kv_head, q_head_per_kv_head,
head_id, head_id,
block_size_tiling_factor, block_size_tiling_factor,
) )
assert (nki_out.shape == ref_out.shape assert (nki_out.shape == ref_out.shape
), f"{nki_out.shape=} != {ref_out.shape=}" ), f"{nki_out.shape=} != {ref_out.shape=}"
assert torch.all(nki_out == ref_out) assert torch.all(nki_out == ref_out)
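The Neuron tests build `NEURON_CC_FLAGS` from a list of flags and scope it the same way. An illustrative sketch of that flag handling; the reader function is hypothetical:

import os

import pytest


def current_compiler_flags() -> list[str]:
    # Hypothetical reader for whatever later consumes NEURON_CC_FLAGS.
    return os.environ.get("NEURON_CC_FLAGS", "").split()


def test_compiler_flags_scoped(monkeypatch: pytest.MonkeyPatch):
    compiler_flags_str = " ".join([
        "-O1",
        "--retry_failed_compilation",
    ])
    with monkeypatch.context() as m:
        m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
        assert current_compiler_flags() == [
            "-O1", "--retry_failed_compilation"
        ]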

View File

@ -320,6 +320,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
]) ])
@torch.inference_mode() @torch.inference_mode()
def test_contexted_kv_attention( def test_contexted_kv_attention(
monkeypatch: pytest.MonkeyPatch,
prefill_batch_size: int, prefill_batch_size: int,
decode_batch_size: int, decode_batch_size: int,
num_heads: int, num_heads: int,
@ -329,7 +330,6 @@ def test_contexted_kv_attention(
large_tile_size, large_tile_size,
mixed_precision: bool, mixed_precision: bool,
) -> None: ) -> None:
import os
import torch_xla.core.xla_model as xm import torch_xla.core.xla_model as xm
@ -340,174 +340,178 @@ def test_contexted_kv_attention(
device = xm.xla_device() device = xm.xla_device()
compiler_flags = [ compiler_flags_str = " ".join([
"-O1", "-O1",
"--retry_failed_compilation", "--retry_failed_compilation",
] ])
compiler_flags_str = " ".join(compiler_flags) with monkeypatch.context() as m:
os.environ["NEURON_CC_FLAGS"] = compiler_flags_str m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
torch.manual_seed(0) torch.manual_seed(0)
torch.set_printoptions(sci_mode=False) torch.set_printoptions(sci_mode=False)
torch.set_default_device("cpu") torch.set_default_device("cpu")
dtype = torch.float32 dtype = torch.float32
min_ctx_len = 32 min_ctx_len = 32
max_ctx_len = 1024 max_ctx_len = 1024
min_query_len = 16 min_query_len = 16
max_query_len = 512 max_query_len = 512
num_kv_heads = num_heads // num_queries_per_kv num_kv_heads = num_heads // num_queries_per_kv
( (
query, query,
k_active, k_active,
v_active, v_active,
k_cache, k_cache,
v_cache, v_cache,
block_table, block_table,
key, key,
value, value,
query_lens, query_lens,
seq_lens, seq_lens,
) = sample_inputs( ) = sample_inputs(
prefill_batch_size=prefill_batch_size, prefill_batch_size=prefill_batch_size,
decode_batch_size=decode_batch_size, decode_batch_size=decode_batch_size,
min_query_len=min_query_len, min_query_len=min_query_len,
max_query_len=max_query_len, max_query_len=max_query_len,
min_ctx_len=min_ctx_len, min_ctx_len=min_ctx_len,
max_ctx_len=max_ctx_len, max_ctx_len=max_ctx_len,
block_size=block_size, block_size=block_size,
num_heads=num_heads, num_heads=num_heads,
num_kv_heads=num_kv_heads, num_kv_heads=num_kv_heads,
head_size=head_size, head_size=head_size,
dtype=dtype, dtype=dtype,
) )
output_ref = ref_context_attention( output_ref = ref_context_attention(
query, query,
key, key,
value, value,
query_lens, query_lens,
seq_lens, seq_lens,
head_size, head_size,
num_queries_per_kv, num_queries_per_kv,
return_max_reduce=False, return_max_reduce=False,
) )
# build neuron program # build neuron program
B_P_SIZE = 128 B_P_SIZE = 128
assert (large_tile_size >= B_P_SIZE assert (large_tile_size >= B_P_SIZE
), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"
def ceil_div(a, b): def ceil_div(a, b):
return (a + b - 1) // b return (a + b - 1) // b
def pad_to_multiple(a, b): def pad_to_multiple(a, b):
return ceil_div(a, b) * b return ceil_div(a, b) * b
def pad_to_next_power_of_2(a): def pad_to_next_power_of_2(a):
assert a > 0 assert a > 0
return 2**int(a - 1).bit_length() return 2**int(a - 1).bit_length()
# calculate input shapes # calculate input shapes
max_num_queries = pad_to_next_power_of_2(sum(query_lens)) max_num_queries = pad_to_next_power_of_2(sum(query_lens))
context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
num_active_blocks = ceil_div(context_lens, block_size).sum().item() num_active_blocks = ceil_div(context_lens, block_size).sum().item()
num_active_blocks = pad_to_multiple(num_active_blocks, num_active_blocks = pad_to_multiple(num_active_blocks,
large_tile_size // block_size) large_tile_size // block_size)
context_kv_len = num_active_blocks * block_size
assert (
    context_kv_len %
    large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"

# pad QKV tensors
pad_dims = (
    0,
    0,
    0,
    0,
    0,
    max_num_queries - query.shape[0],
)
query = F.pad(query, pad_dims, "constant", 0)
k = F.pad(k_active, pad_dims, "constant", 0)
v = F.pad(v_active, pad_dims, "constant", 0)
# permute QKV tensors
# query: (1, n_heads, d, seq_q)
# key: (1, n_kv_heads, d, seq_k)
# value: (1, n_kv_heads, seq_v, d)
query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
k_cache = k_cache.permute(0, 2, 1, 3).contiguous()
v_cache = v_cache.permute(0, 2, 1, 3).contiguous()

# transform block table
active_block_table = get_active_block_tables(
    block_table.cpu(),
    torch.tensor(query_lens).cpu(),
    torch.tensor(seq_lens).cpu(),
    block_size,
    num_active_blocks,
)

# Build attention masks
prior_mask, active_mask = (
    BlockDiagonalCausalFromBottomRightMask.from_seqlens(
        query_lens, seq_lens, block_size=block_size))
prior_mask_padded = F.pad(
    prior_mask,
    (
        0,
        context_kv_len - prior_mask.shape[1],
        0,
        max_num_queries - prior_mask.shape[0],
    ),
    "constant",
    0,
).bool()
active_mask_padded = F.pad(
    active_mask,
    (
        0,
        max_num_queries - active_mask.shape[1],
        0,
        max_num_queries - active_mask.shape[0],
    ),
    "constant",
    0,
).bool()
attn_mask = torch.concat([prior_mask_padded, active_mask_padded],
                         dim=1)
attn_mask = reorder_context_mask(attn_mask, large_tile_size,
                                 block_size)

input_args = (
    query.to(device=device),
    k.to(device=device),
    v.to(device=device),
    k_cache.to(device=device),
    v_cache.to(device=device),
    active_block_table.to(device=device),
    attn_mask.to(device=device),
)
input_kwargs = dict(
    n_kv_head=num_kv_heads,
    head_size=head_size,
    mixed_precision=mixed_precision,
    LARGE_TILE_SZ=large_tile_size,
)
output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)

num_actual_tokens = sum(query_lens)
# - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
output_nki = output_nki.cpu().permute(0, 2, 1, 3)
output_nki = output_nki[0, :num_actual_tokens, :, :]
output_ref_padded = F.pad(
    output_ref,
    (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
    "constant",
    0,
)
output_ref = output_ref_padded.transpose(
    0, 1)[0, :num_actual_tokens, :, :]

torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
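
Aside (not part of the commit): the six-element pad tuple above relies on torch.nn.functional.pad ordering pad widths from the last dimension backwards, so only the leading token dimension of the (num_tokens, n_heads, head_dim) tensors grows to max_num_queries. A minimal, self-contained illustration with made-up shapes:

    import torch
    import torch.nn.functional as F

    query = torch.randn(3, 2, 4)  # (num_tokens, n_heads, head_dim)
    max_num_queries = 8
    # Pairs apply last-dim-first, so the final pair (0, 5) pads dim 0 on the
    # right; the other dimensions are left untouched.
    pad_dims = (0, 0, 0, 0, 0, max_num_queries - query.shape[0])
    padded = F.pad(query, pad_dims, "constant", 0)
    assert padded.shape == (8, 2, 4)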

View File

@ -1,10 +1,10 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import pytest
import torch import torch
from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import get_attn_backend from vllm.attention.selector import get_attn_backend
from vllm.utils import STR_INVALID_VAL from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL
def test_platform_plugins(): def test_platform_plugins():
@ -25,8 +25,9 @@ def test_platform_plugins():
f" is loaded. The first import:\n{_init_trace}") f" is loaded. The first import:\n{_init_trace}")
def test_oot_attention_backend(monkeypatch): def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
# ignore the backend env variable if it is set # ignore the backend env variable if it is set
override_backend_env_variable(monkeypatch, STR_INVALID_VAL) with monkeypatch.context() as m:
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
assert backend.get_name() == "Dummy_Backend" backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert backend.get_name() == "Dummy_Backend"
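
The hunk above is representative of the whole commit: direct os.environ writes and the override_backend_env_variable helper give way to pytest.MonkeyPatch contexts, whose setenv calls are rolled back when the block exits. A minimal sketch of that behaviour, assuming only the standard pytest API (the backend value here is illustrative):

    import os

    import pytest


    def test_env_override_is_scoped(monkeypatch: pytest.MonkeyPatch):
        before = os.environ.get("VLLM_ATTENTION_BACKEND")
        with monkeypatch.context() as m:
            m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
            # The override is visible to code under test inside the context.
            assert os.environ["VLLM_ATTENTION_BACKEND"] == "FLASH_ATTN"
        # On exit the previous value (or absence) is restored automatically.
        assert os.environ.get("VLLM_ATTENTION_BACKEND") == before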

View File

@ -22,43 +22,47 @@ class DummyV1Scheduler(V1Scheduler):
raise Exception("Exception raised by DummyV1Scheduler") raise Exception("Exception raised by DummyV1Scheduler")
def test_scheduler_plugins_v0(monkeypatch): def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_V1", "0") with monkeypatch.context() as m:
with pytest.raises(Exception) as exception_info: m.setenv("VLLM_USE_V1", "0")
with pytest.raises(Exception) as exception_info:
engine_args = EngineArgs( engine_args = EngineArgs(
model="facebook/opt-125m", model="facebook/opt-125m",
enforce_eager=True, # reduce test time enforce_eager=True, # reduce test time
scheduler_cls=DummyV0Scheduler, scheduler_cls=DummyV0Scheduler,
) )
engine = LLMEngine.from_engine_args(engine_args=engine_args) engine = LLMEngine.from_engine_args(engine_args=engine_args)
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
engine.add_request("0", "foo", sampling_params) engine.add_request("0", "foo", sampling_params)
engine.step() engine.step()
assert str(exception_info.value) == "Exception raised by DummyV0Scheduler" assert str(
exception_info.value) == "Exception raised by DummyV0Scheduler"
def test_scheduler_plugins_v1(monkeypatch): def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_V1", "1") with monkeypatch.context() as m:
# Explicitly turn off engine multiprocessing so that the scheduler runs in m.setenv("VLLM_USE_V1", "1")
# this process # Explicitly turn off engine multiprocessing so
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") # that the scheduler runs in this process
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with pytest.raises(Exception) as exception_info: with pytest.raises(Exception) as exception_info:
engine_args = EngineArgs( engine_args = EngineArgs(
model="facebook/opt-125m", model="facebook/opt-125m",
enforce_eager=True, # reduce test time enforce_eager=True, # reduce test time
scheduler_cls=DummyV1Scheduler, scheduler_cls=DummyV1Scheduler,
) )
engine = V1LLMEngine.from_engine_args(engine_args=engine_args) engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
engine.add_request("0", "foo", sampling_params) engine.add_request("0", "foo", sampling_params)
engine.step() engine.step()
assert str(exception_info.value) == "Exception raised by DummyV1Scheduler" assert str(
exception_info.value) == "Exception raised by DummyV1Scheduler"

View File

@ -4,25 +4,29 @@
Run `pytest tests/prefix_caching/test_prefix_caching.py`. Run `pytest tests/prefix_caching/test_prefix_caching.py`.
""" """
from __future__ import annotations
import pytest import pytest
from tests.conftest import VllmRunner from tests.conftest import VllmRunner
from tests.core.utils import SchedulerProxy, create_dummy_prompt from tests.core.utils import SchedulerProxy, create_dummy_prompt
from tests.kernels.utils import override_backend_env_variable
from vllm import SamplingParams, TokensPrompt from vllm import SamplingParams, TokensPrompt
from vllm.core.scheduler import Scheduler from vllm.core.scheduler import Scheduler
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_outputs_equal from ..models.utils import check_outputs_equal
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch): def use_v0_only(monkeypatch: pytest.MonkeyPatch):
""" """
This module relies on V0 internals, so set VLLM_USE_V1=0. This module relies on V0 internals, so set VLLM_USE_V1=0.
""" """
monkeypatch.setenv('VLLM_USE_V1', '0') with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
MODELS = [ MODELS = [
@ -56,7 +60,7 @@ def test_mixed_requests(
cached_position: int, cached_position: int,
enable_chunked_prefill: bool, enable_chunked_prefill: bool,
block_size: int, block_size: int,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Test the case when some sequences have the prefix cache hit Test the case when some sequences have the prefix cache hit
@ -67,72 +71,77 @@ def test_mixed_requests(
pytest.skip("Flashinfer does not support ROCm/HIP.") pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm(): if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.") pytest.skip("Xformers does not support ROCm/HIP.")
override_backend_env_variable(monkeypatch, backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, backend)
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
cached_prompt = example_prompts[cached_position] cached_prompt = example_prompts[cached_position]
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
enable_prefix_caching=True, enable_prefix_caching=True,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
block_size=block_size, block_size=block_size,
) as vllm_model: ) as vllm_model:
# Run the first prompt so the cache is populated # Run the first prompt so the cache is populated
vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens) vllm_outputs = vllm_model.generate_greedy([cached_prompt],
max_tokens)
# Run all the prompts # Run all the prompts
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) greedy_params = SamplingParams(temperature=0.0,
req_outputs = vllm_model.model.generate(example_prompts, greedy_params) max_tokens=max_tokens)
req_outputs = vllm_model.model.generate(example_prompts,
greedy_params)
# Verify number of cached tokens # Verify number of cached tokens
for i in range(len(req_outputs)): for i in range(len(req_outputs)):
if i == cached_position: if i == cached_position:
expected_num_cached_tokens = ( expected_num_cached_tokens = (
len(req_outputs[i].prompt_token_ids) // len(req_outputs[i].prompt_token_ids) //
block_size) * block_size block_size) * block_size
else: else:
expected_num_cached_tokens = 0 expected_num_cached_tokens = 0
assert ( assert (req_outputs[i].num_cached_tokens ==
req_outputs[i].num_cached_tokens == expected_num_cached_tokens) expected_num_cached_tokens)
vllm_outputs = [( vllm_outputs = [(
output.prompt_token_ids + list(output.outputs[0].token_ids), output.prompt_token_ids + list(output.outputs[0].token_ids),
output.prompt + output.outputs[0].text, output.prompt + output.outputs[0].text,
) for output in req_outputs] ) for output in req_outputs]
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
def test_unstable_prompt_sequence( def test_unstable_prompt_sequence(
vllm_runner, vllm_runner,
backend: str, backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
if backend == "FLASHINFER" and current_platform.is_rocm(): if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.") pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm(): if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.") pytest.skip("Xformers does not support ROCm/HIP.")
override_backend_env_variable(monkeypatch, backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, backend)
with vllm_runner( with vllm_runner(
"Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen2.5-0.5B-Instruct",
enable_chunked_prefill=True, enable_chunked_prefill=True,
enable_prefix_caching=True, enable_prefix_caching=True,
max_model_len=4096, max_model_len=4096,
) as vllm_model: ) as vllm_model:
for prompt in UNSTABLE_PROMPT_SEQUENCE: for prompt in UNSTABLE_PROMPT_SEQUENCE:
vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
SamplingParams(max_tokens=1)) SamplingParams(max_tokens=1))
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)

View File

@ -56,12 +56,11 @@ def test_gc():
assert allocated < 50 * 1024 * 1024 assert allocated < 50 * 1024 * 1024
def test_model_from_modelscope(monkeypatch): def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat" with monkeypatch.context() as m:
monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True") m.setenv("VLLM_USE_MODELSCOPE", "True")
try: llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")
llm = LLM(model=MODELSCOPE_MODEL_NAME)
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
@ -73,10 +72,3 @@ def test_model_from_modelscope(monkeypatch):
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
assert len(outputs) == 4 assert len(outputs) == 4
finally:
monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False)
if __name__ == "__main__":
import pytest
pytest.main([__file__])
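
The modelscope test above used a try/finally with monkeypatch.delenv to clean up; inside a context that cleanup is implicit, and delenv itself is the scoped way to guarantee a variable is absent during a test. A hedged sketch (the test name is hypothetical):

    import os

    import pytest


    def test_runs_without_modelscope_flag(monkeypatch: pytest.MonkeyPatch):
        with monkeypatch.context() as m:
            # raising=False makes this a no-op if the variable was never set;
            # either way the original state comes back when the context ends.
            m.delenv("VLLM_USE_MODELSCOPE", raising=False)
            assert "VLLM_USE_MODELSCOPE" not in os.environ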

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# ruff: noqa
import asyncio import asyncio
import os
import socket import socket
from collections.abc import AsyncIterator from collections.abc import AsyncIterator
from unittest.mock import patch from unittest.mock import patch
@ -112,16 +112,16 @@ def test_deprecate_kwargs_additional_message():
dummy(old_arg=1) dummy(old_arg=1)
def test_get_open_port(): def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
os.environ["VLLM_PORT"] = "5678" with monkeypatch.context() as m:
# make sure we can get multiple ports, even if the env var is set m.setenv("VLLM_PORT", "5678")
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1: # make sure we can get multiple ports, even if the env var is set
s1.bind(("localhost", get_open_port())) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2: s1.bind(("localhost", get_open_port()))
s2.bind(("localhost", get_open_port())) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3: s2.bind(("localhost", get_open_port()))
s3.bind(("localhost", get_open_port())) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
os.environ.pop("VLLM_PORT") s3.bind(("localhost", get_open_port()))
# Tests for FlexibleArgumentParser # Tests for FlexibleArgumentParser
@ -366,31 +366,32 @@ def test_bind_kv_cache_non_attention():
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
def test_bind_kv_cache_encoder_decoder(monkeypatch): def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
# V1 TESTS: ENCODER_DECODER is not supported on V1 yet. # V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
monkeypatch.setenv("VLLM_USE_V1", "0") with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
from vllm.attention import Attention, AttentionType from vllm.attention import Attention, AttentionType
# example from bart # example from bart
ctx = { ctx = {
'encoder.layers.0.self_attn.attn': 'encoder.layers.0.self_attn.attn':
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
'decoder.layers.0.encoder_attn.attn': 'decoder.layers.0.encoder_attn.attn':
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
'decoder.layers.0.self_attn.attn': 'decoder.layers.0.self_attn.attn':
Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
} }
kv_cache = [ kv_cache = [
torch.zeros((1, )), torch.zeros((1, )),
] ]
encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
bind_kv_cache(ctx, [kv_cache]) bind_kv_cache(ctx, [kv_cache])
assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
def test_bind_kv_cache_pp(): def test_bind_kv_cache_pp():

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os import pytest
from vllm.config import CompilationLevel from vllm.config import CompilationLevel
@ -9,16 +9,17 @@ from ..utils import compare_two_settings
# --enforce-eager on TPU causes graph compilation # --enforce-eager on TPU causes graph compilation
# this times out default Health Check in the MQLLMEngine, # this times out default Health Check in the MQLLMEngine,
# so we set the timeout here to 30s # so we set the timeout here to 30s
os.environ["VLLM_RPC_TIMEOUT"] = "30000"
def test_custom_dispatcher(): def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
compare_two_settings( with monkeypatch.context() as m:
"google/gemma-2b", m.setenv("VLLM_RPC_TIMEOUT", "30000")
arg1=[ compare_two_settings(
"--enforce-eager", "google/gemma-2b",
f"-O{CompilationLevel.DYNAMO_ONCE}", arg1=[
], "--enforce-eager",
arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], f"-O{CompilationLevel.DYNAMO_ONCE}",
env1={}, ],
env2={}) arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"],
env1={},
env2={})
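
The TPU test above previously exported VLLM_RPC_TIMEOUT at import time, which leaks into every test collected in the same session; scoping it inside the test, as the diff does, confines the override. A condensed sketch of the new shape, with the engine invocation elided:

    import pytest


    def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
        with monkeypatch.context() as m:
            # Scoped to this test only; reverted even if the body raises.
            m.setenv("VLLM_RPC_TIMEOUT", "30000")
            # ... call compare_two_settings("google/gemma-2b", ...) here ...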

View File

@ -1,10 +1,12 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# ruff: noqa
# type: ignore
from __future__ import annotations
import os
import threading import threading
from collections.abc import Iterable from collections.abc import Iterable
from concurrent import futures from concurrent import futures
from typing import Callable, Literal from typing import Callable, Generator, Literal
import grpc import grpc
import pytest import pytest
@ -21,12 +23,14 @@ from vllm.tracing import SpanAttributes
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch): def use_v0_only(monkeypatch: pytest.MonkeyPatch):
""" """
Since this module is V0 only, set VLLM_USE_V1=0 for Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module. all tests in the module.
""" """
monkeypatch.setenv('VLLM_USE_V1', '0') with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
@ -67,7 +71,7 @@ class FakeTraceService(TraceServiceServicer):
@pytest.fixture @pytest.fixture
def trace_service(): def trace_service() -> Generator[FakeTraceService, None, None]:
"""Fixture to set up a fake gRPC trace service""" """Fixture to set up a fake gRPC trace service"""
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
service = FakeTraceService() service = FakeTraceService()
@ -80,136 +84,153 @@ def trace_service():
server.stop(None) server.stop(None)
def test_traces(trace_service): def test_traces(
os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" monkeypatch: pytest.MonkeyPatch,
trace_service: FakeTraceService,
):
with monkeypatch.context() as m:
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(
top_p=0.1, temperature=0.01,
max_tokens=256) top_p=0.1,
model = "facebook/opt-125m" max_tokens=256,
llm = LLM( )
model=model, model = "facebook/opt-125m"
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, llm = LLM(
) model=model,
prompts = ["This is a short prompt"] otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
outputs = llm.generate(prompts, sampling_params=sampling_params) )
prompts = ["This is a short prompt"]
outputs = llm.generate(prompts, sampling_params=sampling_params)
timeout = 5 timeout = 5
if not trace_service.evt.wait(timeout): if not trace_service.evt.wait(timeout):
raise TimeoutError( raise TimeoutError(
f"The fake trace service didn't receive a trace within " f"The fake trace service didn't receive a trace within "
f"the {timeout} seconds timeout") f"the {timeout} seconds timeout")
request = trace_service.request request = trace_service.request
assert len(request.resource_spans) == 1, ( assert len(request.resource_spans) == 1, (
f"Expected 1 resource span, " f"Expected 1 resource span, "
f"but got {len(request.resource_spans)}") f"but got {len(request.resource_spans)}")
assert len(request.resource_spans[0].scope_spans) == 1, ( assert len(request.resource_spans[0].scope_spans) == 1, (
f"Expected 1 scope span, " f"Expected 1 scope span, "
f"but got {len(request.resource_spans[0].scope_spans)}") f"but got {len(request.resource_spans[0].scope_spans)}")
assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
f"Expected 1 span, " f"Expected 1 span, "
f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
attributes = decode_attributes( attributes = decode_attributes(
request.resource_spans[0].scope_spans[0].spans[0].attributes) request.resource_spans[0].scope_spans[0].spans[0].attributes)
assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
) == sampling_params.temperature ) == sampling_params.temperature
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get( assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens ) == sampling_params.max_tokens
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n assert attributes.get(
assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
outputs[0].prompt_token_ids) assert attributes.get(
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
assert attributes.get( outputs[0].prompt_token_ids)
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
metrics = outputs[0].metrics assert attributes.get(
assert attributes.get( SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue metrics = outputs[0].metrics
ttft = metrics.first_token_time - metrics.arrival_time assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
assert attributes.get( ) == metrics.time_in_queue
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft ttft = metrics.first_token_time - metrics.arrival_time
e2e_time = metrics.finished_time - metrics.arrival_time assert attributes.get(
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
assert metrics.scheduler_time > 0 e2e_time = metrics.finished_time - metrics.arrival_time
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
) == metrics.scheduler_time assert metrics.scheduler_time > 0
# Model forward and model execute should be none, since detailed traces is assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
# not enabled. ) == metrics.scheduler_time
assert metrics.model_forward_time is None # Model forward and model execute should be none, since detailed traces is
assert metrics.model_execute_time is None # not enabled.
assert metrics.model_forward_time is None
assert metrics.model_execute_time is None
def test_traces_with_detailed_steps(trace_service): def test_traces_with_detailed_steps(
os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" monkeypatch: pytest.MonkeyPatch,
trace_service: FakeTraceService,
):
with monkeypatch.context() as m:
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(
top_p=0.1, temperature=0.01,
max_tokens=256) top_p=0.1,
model = "facebook/opt-125m" max_tokens=256,
llm = LLM( )
model=model, model = "facebook/opt-125m"
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, llm = LLM(
collect_detailed_traces="all", model=model,
) otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
prompts = ["This is a short prompt"] collect_detailed_traces="all",
outputs = llm.generate(prompts, sampling_params=sampling_params) )
prompts = ["This is a short prompt"]
outputs = llm.generate(prompts, sampling_params=sampling_params)
timeout = 5 timeout = 5
if not trace_service.evt.wait(timeout): if not trace_service.evt.wait(timeout):
raise TimeoutError( raise TimeoutError(
f"The fake trace service didn't receive a trace within " f"The fake trace service didn't receive a trace within "
f"the {timeout} seconds timeout") f"the {timeout} seconds timeout")
request = trace_service.request request = trace_service.request
assert len(request.resource_spans) == 1, ( assert len(request.resource_spans) == 1, (
f"Expected 1 resource span, " f"Expected 1 resource span, "
f"but got {len(request.resource_spans)}") f"but got {len(request.resource_spans)}")
assert len(request.resource_spans[0].scope_spans) == 1, ( assert len(request.resource_spans[0].scope_spans) == 1, (
f"Expected 1 scope span, " f"Expected 1 scope span, "
f"but got {len(request.resource_spans[0].scope_spans)}") f"but got {len(request.resource_spans[0].scope_spans)}")
assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
f"Expected 1 span, " f"Expected 1 span, "
f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
attributes = decode_attributes( attributes = decode_attributes(
request.resource_spans[0].scope_spans[0].spans[0].attributes) request.resource_spans[0].scope_spans[0].spans[0].attributes)
assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
) == sampling_params.temperature ) == sampling_params.temperature
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get( assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens ) == sampling_params.max_tokens
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n assert attributes.get(
assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
outputs[0].prompt_token_ids) assert attributes.get(
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
assert attributes.get( outputs[0].prompt_token_ids)
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
metrics = outputs[0].metrics assert attributes.get(
assert attributes.get( SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue metrics = outputs[0].metrics
ttft = metrics.first_token_time - metrics.arrival_time assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
assert attributes.get( ) == metrics.time_in_queue
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft ttft = metrics.first_token_time - metrics.arrival_time
e2e_time = metrics.finished_time - metrics.arrival_time assert attributes.get(
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
assert metrics.scheduler_time > 0 e2e_time = metrics.finished_time - metrics.arrival_time
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
) == metrics.scheduler_time assert metrics.scheduler_time > 0
assert metrics.model_forward_time > 0 assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
assert attributes.get( ) == metrics.scheduler_time
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( assert metrics.model_forward_time > 0
metrics.model_forward_time / 1000) assert attributes.get(
assert metrics.model_execute_time > 0 SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE ) == pytest.approx(metrics.model_forward_time / 1000)
) == metrics.model_execute_time assert metrics.model_execute_time > 0
assert metrics.model_forward_time < 1000 * metrics.model_execute_time assert attributes.get(
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
) == metrics.model_execute_time
assert metrics.model_forward_time < 1000 * metrics.model_execute_time

View File

@ -566,6 +566,7 @@ def init_test_distributed_environment(
def multi_process_parallel( def multi_process_parallel(
monkeypatch: pytest.MonkeyPatch,
tp_size: int, tp_size: int,
pp_size: int, pp_size: int,
test_target: Any, test_target: Any,
@ -582,7 +583,13 @@ def multi_process_parallel(
refs = [] refs = []
for rank in range(tp_size * pp_size): for rank in range(tp_size * pp_size):
refs.append( refs.append(
test_target.remote(tp_size, pp_size, rank, distributed_init_port)) test_target.remote(
monkeypatch,
tp_size,
pp_size,
rank,
distributed_init_port,
), )
ray.get(refs) ray.get(refs)
ray.shutdown() ray.shutdown()

View File

@ -1,5 +1,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import random import random
from typing import Any
import pytest import pytest
@ -50,8 +53,12 @@ def model_name():
return "meta-llama/Meta-Llama-3-8B-Instruct" return "meta-llama/Meta-Llama-3-8B-Instruct"
def test_ngram_correctness(monkeypatch, test_prompts, sampling_config, def test_ngram_correctness(
model_name): monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
sampling_config: SamplingParams,
model_name: str,
):
''' '''
Compare the outputs of a original LLM and a speculative LLM Compare the outputs of a original LLM and a speculative LLM
should be the same when using ngram speculative decoding. should be the same when using ngram speculative decoding.

View File

@ -80,9 +80,11 @@ async def generate(engine: AsyncLLM,
[(TEXT_ENGINE_ARGS, TEXT_PROMPT), [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
(VISION_ENGINE_ARGS, VISION_PROMPT)]) (VISION_ENGINE_ARGS, VISION_PROMPT)])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load(monkeypatch, output_kind: RequestOutputKind, async def test_load(
engine_args_and_prompt: tuple[AsyncEngineArgs, monkeypatch: pytest.MonkeyPatch,
PromptType]): output_kind: RequestOutputKind,
engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType],
):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
# so that in the future when we switch, we don't have to change all the # so that in the future when we switch, we don't have to change all the
# tests. # tests.
@ -126,7 +128,8 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind,
[(TEXT_ENGINE_ARGS, TEXT_PROMPT), [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
(VISION_ENGINE_ARGS, VISION_PROMPT)]) (VISION_ENGINE_ARGS, VISION_PROMPT)])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_abort(monkeypatch, output_kind: RequestOutputKind, async def test_abort(monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
engine_args_and_prompt: tuple[AsyncEngineArgs, engine_args_and_prompt: tuple[AsyncEngineArgs,
PromptType]): PromptType]):

View File

@ -45,7 +45,7 @@ def make_request() -> EngineCoreRequest:
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_engine_core(monkeypatch): def test_engine_core(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
@ -159,7 +159,7 @@ def test_engine_core(monkeypatch):
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_engine_core_advanced_sampling(monkeypatch): def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
""" """
A basic end-to-end test to verify that the engine functions correctly A basic end-to-end test to verify that the engine functions correctly
when additional sampling parameters, such as top_p, min_tokens, and when additional sampling parameters, such as top_p, min_tokens, and
@ -209,7 +209,7 @@ def test_engine_core_advanced_sampling(monkeypatch):
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_engine_core_concurrent_batches(monkeypatch): def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
""" """
Test that the engine can handle multiple concurrent batches. Test that the engine can handle multiple concurrent batches.
""" """

View File

@ -90,7 +90,8 @@ def echo(self, msg: str, err_msg: Optional[str] = None) -> str:
@fork_new_process_for_each_test @fork_new_process_for_each_test
@pytest.mark.parametrize("multiprocessing_mode", [True, False]) @pytest.mark.parametrize("multiprocessing_mode", [True, False])
def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
@ -175,7 +176,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
@pytest.mark.asyncio(loop_scope="function") @pytest.mark.asyncio(loop_scope="function")
async def test_engine_core_client_asyncio(monkeypatch): async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")

View File

@ -255,12 +255,10 @@ def _run_and_validate(
[NONE, SAMPLE, PROMPT, SAMPLE_PROMPT]) [NONE, SAMPLE, PROMPT, SAMPLE_PROMPT])
@pytest.mark.parametrize("temperature", [0.0, 2.0]) @pytest.mark.parametrize("temperature", [0.0, 2.0])
def test_get_logprobs_and_prompt_logprobs( def test_get_logprobs_and_prompt_logprobs(
hf_model, hf_model, vllm_model,
vllm_model, batch_logprobs_composition: BatchLogprobsComposition,
batch_logprobs_composition: BatchLogprobsComposition, temperature: float, example_prompts: list[str],
temperature: float, monkeypatch: pytest.MonkeyPatch) -> None:
example_prompts,
) -> None:
"""Test V1 Engine logprobs & prompt logprobs """Test V1 Engine logprobs & prompt logprobs
Exercise a variety of combinations of `logprobs` and `prompt_logprobs` Exercise a variety of combinations of `logprobs` and `prompt_logprobs`
@ -287,128 +285,140 @@ def test_get_logprobs_and_prompt_logprobs(
temperature: "temperature" sampling parameter temperature: "temperature" sampling parameter
example_prompts: example prompt fixture example_prompts: example prompt fixture
""" """
do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching with monkeypatch.context() as m:
if do_apc and (temperature < 2.0 m.setenv("VLLM_USE_V1", "1")
or batch_logprobs_composition != SAMPLE_PROMPT): do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching
# Skip some test-cases to save time. if do_apc and (temperature < 2.0
pytest.skip() or batch_logprobs_composition != SAMPLE_PROMPT):
test_prompts = example_prompts # Skip some test-cases to save time.
pytest.skip()
test_prompts = example_prompts
max_tokens = 5 max_tokens = 5
hf_outputs = hf_model.generate_greedy( hf_outputs = hf_model.generate_greedy(
test_prompts, test_prompts,
max_tokens=max_tokens,
)
hf_logprobs = hf_model.generate_greedy_logprobs(
test_prompts,
max_tokens=max_tokens,
)
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
# Ensure that each test prompt has a logprob config for testing
logprob_prompt_logprob_list = _repeat_logprob_config(
test_prompts, logprob_prompt_logprob_list)
# Generate SamplingParams
vllm_sampling_params = [
SamplingParams(max_tokens=max_tokens,
logprobs=num_lp,
prompt_logprobs=num_plp,
temperature=temperature,
seed=1984)
for num_lp, num_plp in logprob_prompt_logprob_list
]
for _ in range(2 if do_apc else 1):
_run_and_validate(
vllm_model=vllm_model,
test_prompts=test_prompts,
vllm_sampling_params=vllm_sampling_params,
hf_logprobs=hf_logprobs,
hf_outputs=hf_outputs,
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
temperature=temperature,
max_tokens=max_tokens, max_tokens=max_tokens,
do_apc=do_apc) )
hf_logprobs = hf_model.generate_greedy_logprobs(
test_prompts,
max_tokens=max_tokens,
)
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list = get_test_batch(
batch_logprobs_composition)
# Ensure that each test prompt has a logprob config for testing
logprob_prompt_logprob_list = _repeat_logprob_config(
test_prompts, logprob_prompt_logprob_list)
# Generate SamplingParams
vllm_sampling_params = [
SamplingParams(max_tokens=max_tokens,
logprobs=num_lp,
prompt_logprobs=num_plp,
temperature=temperature,
seed=1984)
for num_lp, num_plp in logprob_prompt_logprob_list
]
for _ in range(2 if do_apc else 1):
_run_and_validate(
vllm_model=vllm_model,
test_prompts=test_prompts,
vllm_sampling_params=vllm_sampling_params,
hf_logprobs=hf_logprobs,
hf_outputs=hf_outputs,
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
temperature=temperature,
max_tokens=max_tokens,
do_apc=do_apc)
def test_max_logprobs(): def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs` """vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs` Should also fail for `prompt_logprobs > max_logprobs`
APC should not matter as this test checks basic request validation. APC should not matter as this test checks basic request validation.
Args:
monkeypatch
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
runner = VllmRunner("facebook/opt-125m", runner = VllmRunner("facebook/opt-125m",
max_logprobs=1, max_logprobs=1,
enable_prefix_caching=False, enable_prefix_caching=False,
max_model_len=256) max_model_len=256)
vllm_sampling_params = SamplingParams(logprobs=1) vllm_sampling_params = SamplingParams(logprobs=1)
# should pass # should pass
runner.generate(["Hello world"], sampling_params=vllm_sampling_params) runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
bad_sampling_params = SamplingParams(logprobs=2) bad_sampling_params = SamplingParams(logprobs=2)
with pytest.raises(ValueError): with pytest.raises(ValueError):
runner.generate(["Hello world"], sampling_params=bad_sampling_params) runner.generate(["Hello world"],
sampling_params=bad_sampling_params)
def test_none_logprobs(vllm_model, example_prompts): def test_none_logprobs(vllm_model, example_prompts,
monkeypatch: pytest.MonkeyPatch):
"""Engine should return `logprobs` and `prompt_logprobs` as `None` """Engine should return `logprobs` and `prompt_logprobs` as `None`
Args: Args:
vllm_model: vLLM model fixture vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
max_tokens = 5 with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5
sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, sampling_params_logprobs_none = SamplingParams(
logprobs=None, max_tokens=max_tokens,
prompt_logprobs=None, logprobs=None,
temperature=0.0) prompt_logprobs=None,
results_logprobs_none = vllm_model.model.generate( temperature=0.0,
example_prompts, sampling_params=sampling_params_logprobs_none) )
results_logprobs_none = vllm_model.model.generate(
example_prompts,
sampling_params=sampling_params_logprobs_none,
)
for i in range(len(results_logprobs_none)): for i in range(len(results_logprobs_none)):
# Check sample logprobs are None # Check sample logprobs are None
assert results_logprobs_none[i].outputs[0].logprobs is None assert results_logprobs_none[i].outputs[0].logprobs is None
assert results_logprobs_none[i].outputs[0].cumulative_logprob is None assert results_logprobs_none[i].outputs[
# Check prompt logprobs are None 0].cumulative_logprob is None
assert results_logprobs_none[i].prompt_logprobs is None # Check prompt logprobs are None
assert results_logprobs_none[i].prompt_logprobs is None
def test_zero_logprobs(vllm_model, example_prompts): def test_zero_logprobs(vllm_model, example_prompts,
monkeypatch: pytest.MonkeyPatch):
"""Engine should return sampled token and prompt token logprobs """Engine should return sampled token and prompt token logprobs
Args: Args:
vllm_model: vLLM model fixture vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
max_tokens = 5 with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5
sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens, sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens,
logprobs=0, logprobs=0,
prompt_logprobs=0, prompt_logprobs=0,
temperature=0.0) temperature=0.0)
results_logprobs_zero = vllm_model.model.generate( results_logprobs_zero = vllm_model.model.generate(
example_prompts, sampling_params=sampling_params_logprobs_zero) example_prompts, sampling_params=sampling_params_logprobs_zero)
for i in range(len(results_logprobs_zero)): for i in range(len(results_logprobs_zero)):
# Check that there is one sample logprob dict for each # Check that there is one sample logprob dict for each
# sample token # sample token
logprobs = results_logprobs_zero[i].outputs[0].logprobs logprobs = results_logprobs_zero[i].outputs[0].logprobs
prompt_logprobs = results_logprobs_zero[i].prompt_logprobs prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
prompt_token_ids = results_logprobs_zero[i].prompt_token_ids prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
assert logprobs is not None assert logprobs is not None
assert len(sampled_token_ids) == len(logprobs) assert len(sampled_token_ids) == len(logprobs)
assert results_logprobs_zero[i].outputs[ assert results_logprobs_zero[i].outputs[
0].cumulative_logprob is not None 0].cumulative_logprob is not None
# Check that there is one prompt logprob dict for each # Check that there is one prompt logprob dict for each
# prompt token # prompt token
assert prompt_logprobs is not None assert prompt_logprobs is not None
assert len(prompt_token_ids) == len(prompt_logprobs) assert len(prompt_token_ids) == len(prompt_logprobs)

View File

@ -3,11 +3,16 @@
Run `pytest tests/v1/tpu/test_basic.py`. Run `pytest tests/v1/tpu/test_basic.py`.
""" """
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest import pytest
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ...conftest import VllmRunner if TYPE_CHECKING:
from tests.conftest import VllmRunner
MODELS = [ MODELS = [
# "Qwen/Qwen2-7B-Instruct", # "Qwen/Qwen2-7B-Instruct",
@ -28,7 +33,8 @@ TENSOR_PARALLEL_SIZES = [1]
@pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES) @pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES)
def test_models( def test_models(
monkeypatch, vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
model: str, model: str,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
@ -41,7 +47,7 @@ def test_models(
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
with VllmRunner( with vllm_runner(
model, model,
max_model_len=8192, max_model_len=8192,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
@ -50,5 +56,5 @@ def test_models(
tensor_parallel_size=tensor_parallel_size) as vllm_model: tensor_parallel_size=tensor_parallel_size) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens) max_tokens)
output = vllm_outputs[0][1] output = vllm_outputs[0][1]
assert "1024" in output assert "1024" in output