[Misc] Replace os.environ with monkeypatch in test suite (#14516)

Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
Sibi 2025-03-17 11:35:57 +08:00 committed by GitHub
parent 1e799b7ec1
commit a73e183e36
43 changed files with 1900 additions and 1658 deletions
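
The change this commit applies throughout the suite, as a minimal sketch (the test name below is illustrative, not from the diff; the environment variable is one that appears in the diffs that follow): pytest's monkeypatch scopes environment edits to the enclosing context and restores the previous value automatically, whereas writing to os.environ directly leaks state into later tests in the same process.

    import pytest

    # Before: mutates the process-wide environment and leaks into later tests.
    # os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"

    # After: the change is scoped to the context block and undone on exit.
    def test_backend_selection(monkeypatch: pytest.MonkeyPatch):
        with monkeypatch.context() as m:
            m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
            ...  # code under test reads the variable here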


@@ -522,7 +522,7 @@ steps:
     # TODO: investigate and fix
     # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
     - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-    - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+    - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
 
 - label: Plugin Tests (2 GPUs) # 40min
   working_dir: "/vllm-workspace/tests"


@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     model: str,
     backend: str,
@@ -63,7 +64,8 @@ def test_models(
         pytest.skip(
             f"{backend} does not support gemma2 with full context length.")
-    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", backend)
 
         # 5042 tokens for gemma2
         # gemma2 has alternating sliding window size of 4096
@@ -80,7 +82,8 @@ def test_models(
                          dtype=dtype,
                          enforce_eager=enforce_eager,
                          gpu_memory_utilization=0.7) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
 
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
@@ -104,6 +107,7 @@ def test_models(
     ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
 ])
 def test_models_distributed(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     vllm_runner,
     example_prompts,
@@ -116,13 +120,17 @@ def test_models_distributed(
     if test_suite != TARGET_TEST_SUITE:
         pytest.skip(f"Skip test for {test_suite}")
 
+    with monkeypatch.context() as monkeypatch_context:
         if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
             # test Ray Compiled Graph
-            os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-            os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+            monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
 
         if attention_backend:
-            os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
+            monkeypatch_context.setenv(
+                "VLLM_ATTENTION_BACKEND",
+                attention_backend,
+            )
 
         dtype = "half"
         max_tokens = 5
@@ -130,13 +138,16 @@ def test_models_distributed(
         # NOTE: take care of the order. run vLLM first, and then run HF.
         # vLLM needs a fresh new process without cuda initialization.
         # if we run HF first, the cuda initialization will be done and it
-        # will hurt multiprocessing backend with fork method (the default method).
-        with vllm_runner(model,
+        # will hurt multiprocessing backend with fork method
+        # (the default method).
+        with vllm_runner(
+            model,
             dtype=dtype,
             tensor_parallel_size=2,
-            distributed_executor_backend=distributed_executor_backend
+            distributed_executor_backend=distributed_executor_backend,
         ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
 
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)


@@ -7,16 +7,22 @@ prefill requests are chunked.
 Run `pytest tests/models/test_chunked_prefill.py`.
 """
-import os
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
 
 import pytest
 
-from tests.kernels.utils import override_backend_env_variable
 from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR
 
 from ..models.utils import check_logprobs_close, check_outputs_equal
 from ..utils import multi_gpu_test
 
+if TYPE_CHECKING:
+    from .conftest import HfRunner, VllmRunner
+
 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-3.2-1B-Instruct",
@@ -24,12 +30,14 @@ MODELS = [
 
 @pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
     """
     Since this module is V0 only, set VLLM_USE_V1=0 for
     all tests in the file.
     """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -42,8 +50,8 @@ def use_v0_only(monkeypatch):
 @pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
     example_prompts,
     model: str,
     dtype: str,
@@ -52,13 +60,14 @@ def test_models(
     enforce_eager: bool,
     tensor_parallel_size: int,
     attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """
     Checks exact match decode between huggingface model and vllm runner with
     chunked prefill.
     """
-    override_backend_env_variable(monkeypatch, attention_backend)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
 
         max_num_seqs = chunked_prefill_token_size
         max_num_batched_tokens = chunked_prefill_token_size
@@ -75,7 +84,8 @@ def test_models(
                 enforce_eager=enforce_eager,
                 max_num_seqs=max_num_seqs,
         ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
 
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
@@ -90,21 +100,21 @@ def test_models(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models_distributed(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
     example_prompts,
     model: str,
     distributed_executor_backend: str,
     attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    override_backend_env_variable(monkeypatch, attention_backend)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
 
         if (model == "meta-llama/Llama-3.2-1B-Instruct"
                 and distributed_executor_backend == "ray"):
             # test Ray Compiled Graph
-            os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-            os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+            m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
 
         dtype = "half"
         max_tokens = 5
@@ -119,7 +129,8 @@ def test_models_distributed(
         # NOTE: take care of the order. run vLLM first, and then run HF.
         # vLLM needs a fresh new process without cuda initialization.
         # if we run HF first, the cuda initialization will be done and it
-        # will hurt multiprocessing backend with fork method (the default method).
+        # will hurt multiprocessing backend with
+        # fork method (the default method).
 
         with vllm_runner(
             model,
@@ -130,7 +141,10 @@ def test_models_distributed(
             max_num_batched_tokens=max_num_batched_tokens,
             distributed_executor_backend=distributed_executor_backend,
         ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            vllm_outputs = vllm_model.generate_greedy(
+                example_prompts,
+                max_tokens,
+            )
 
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
@@ -158,7 +172,7 @@ def test_models_distributed(
 # the async postprocessor
 @pytest.mark.parametrize("disable_async_output_proc", [True])
 def test_models_with_fp8_kv_cache(
-    vllm_runner,
+    vllm_runner: VllmRunner,
     example_prompts,
     kv_cache_dtype: str,
     model: str,
@@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache(
 @pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("dtype", ["half"])
 def test_with_prefix_caching(
-    vllm_runner,
+    vllm_runner: VllmRunner,
     max_tokens: int,
     enforce_eager: bool,
     chunk_size: int,
@@ -254,8 +268,10 @@ def test_with_prefix_caching(
     ) as vllm_model:
         outputs[enable] = []
         for prompt in full_prompts:
-            outputs[enable] += vllm_model.generate_greedy([prompt],
-                                                          max_tokens)
+            outputs[enable] += vllm_model.generate_greedy(
+                [prompt],
+                max_tokens,
+            )
 
     check_outputs_equal(
         outputs_0_lst=outputs[False],
@@ -274,8 +290,8 @@ def test_with_prefix_caching(
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_models_cpu(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
     example_prompts,
     model: str,
     dtype: str,
@@ -283,7 +299,7 @@ def test_models_cpu(
     chunked_prefill_token_size: int,
     enforce_eager: bool,
     attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     test_models(
         hf_runner,
@@ -307,7 +323,7 @@ def test_models_cpu(
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_with_prefix_caching_cpu(
-    vllm_runner,
+    vllm_runner: VllmRunner,
     max_tokens: int,
     enforce_eager: bool,
     chunk_size: int,


@@ -123,9 +123,9 @@ def test_cumem_with_cudagraph():
     # sleep mode with pytorch checkpoint
     ("facebook/opt-125m", False),
 ])
-def test_end_to_end(model: str, use_v1: bool):
-    import os
-    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
+def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
         free, total = torch.cuda.mem_get_info()
         used_bytes_baseline = total - free  # in case other process is running
         llm = LLM(model, enable_sleep_mode=True)
@@ -158,5 +158,3 @@ def test_end_to_end(model: str, use_v1: bool):
 
     # cmp output
     assert output[0].outputs[0].text == output2[0].outputs[0].text
-
-    del os.environ["VLLM_USE_V1"]
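
With the context-manager form, the manual cleanup that used to end this test (the `del os.environ["VLLM_USE_V1"]` removed above) is no longer needed. A simplified sketch of the test above with its body trimmed, showing only the mechanism:

    import pytest

    def test_end_to_end(monkeypatch: pytest.MonkeyPatch, use_v1: bool):
        with monkeypatch.context() as m:
            m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
            ...  # exercise the LLM here
        # On exiting the block, VLLM_USE_V1 is restored to whatever it was
        # before the test, so no explicit `del os.environ[...]` is required.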


@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
 import dataclasses
-from typing import Optional
 
 import pytest
@@ -22,8 +22,11 @@ class TestSetting:
     fullgraph: bool
 
 
-# representative settings for testing
-test_settings = [
+# we cannot afford testing the full Catesian product
+# of all models and all levels
+@pytest.mark.parametrize(
+    "test_setting",
+    [
         # basic llama model
         TestSetting(
             model="meta-llama/Llama-3.2-1B-Instruct",
@@ -84,13 +87,11 @@ test_settings = [
             method="generate_with_image",
             fullgraph=False,
         ),
-]
-
-
-# we cannot afford testing the full Catesian product
-# of all models and all levels
-@pytest.mark.parametrize("test_setting", test_settings)
-def test_compile_correctness(test_setting: TestSetting):
+    ])
+def test_compile_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    test_setting: TestSetting,
+):
     # this test is run under multiple suits, with different GPUs.
     # make sure we only run the test with correct CUDA devices.
     # don't use "<", as it will duplicate the tests.
@@ -103,13 +104,17 @@ def test_compile_correctness(test_setting: TestSetting):
     fullgraph = test_setting.fullgraph
     if cuda_device_count_stateless() != pp_size * tp_size:
         pytest.skip("Not correct CUDA devices for the test.")
-    import os
-    os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
-    final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
-        ["-tp", str(tp_size)]
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+        final_args = [
+            "--enforce-eager", *model_args, "-pp",
+            str(pp_size), "-tp",
+            str(tp_size)
+        ]
 
         all_args: list[list[str]] = []
-        all_envs: list[Optional[dict[str, str]]] = []
+        all_envs: list[dict[str, str] | None] = []
 
         for level in [
                 CompilationLevel.NO_COMPILATION,


@@ -1,22 +1,115 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import pytest
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+import torch
 
+from tests.quantization.utils import is_quant_method_supported
+from vllm import LLM, SamplingParams
 from vllm.config import CompilationLevel
+from vllm.platforms import current_platform
 
 from ..utils import fork_new_process_for_each_test
-from .utils import TEST_MODELS, check_full_graph_support
 
 
-@pytest.mark.parametrize("model_info", TEST_MODELS)
+@pytest.fixture(params=None, name="model_info")
+def models_list_fixture(request):
+    TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
+        ("facebook/opt-125m", {}),
+        ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
+            "dtype": torch.float16,
+            "quantization": "compressed-tensors"
+        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
+            "dtype": torch.float16,
+            "quantization": "compressed-tensors"
+        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
+            "quantization": "compressed-tensors"
+        }),
+        ("meta-llama/Llama-3.2-1B-Instruct", {}),
+    ]
+
+    if is_quant_method_supported("aqlm"):
+        TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
+            "quantization": "aqlm"
+        }))
+
+    # TODO: figure out why this fails.
+    if False and is_quant_method_supported("gguf"):  # noqa: SIM223
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
+            "quantization": "gguf"
+        }))
+
+    if is_quant_method_supported("gptq"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
+            "quantization": "gptq"
+        }))
+
+    if is_quant_method_supported("gptq_marlin"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
+            "quantization": "gptq_marlin"
+        }))
+
+    if is_quant_method_supported("gptq_marlin_24"):
+        TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
+            "quantization": "gptq_marlin_24"
+        }))
+
+    if is_quant_method_supported("marlin"):
+        TEST_MODELS.append(
+            ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
+                "quantization": "marlin"
+            }))
+
+    if not current_platform.is_rocm() and is_quant_method_supported("awq"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
+            "quantization": "AWQ"
+        }))
+
+    return TEST_MODELS
+
+
 @pytest.mark.parametrize(
     "optimization_level",
-    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
+    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
+)
+@pytest.mark.parametrize("model_info", "", indirect=True)
 @fork_new_process_for_each_test
-def test_full_graph(model_info, optimization_level):
-    model = model_info[0]
-    model_kwargs = model_info[1]
-    check_full_graph_support(model,
-                             model_kwargs,
-                             optimization_level,
-                             tp_size=1)
+def test_full_graph(
+    monkeypatch: pytest.MonkeyPatch,
+    model_info: tuple[str, dict[str, Any]],
+    optimization_level: int,
+):
+    model, model_kwargs = model_info
+
+    with monkeypatch.context() as m:
+        # make sure these models can be captured in full graph mode
+        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
+        print(f"MODEL={model}")
+
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        sampling_params = SamplingParams(temperature=0)
+        llm = LLM(
+            model=model,
+            enforce_eager=True,
+            tensor_parallel_size=1,
+            disable_custom_all_reduce=True,
+            compilation_config=optimization_level,
+            **model_kwargs,
+        )
+        outputs = llm.generate(prompts, sampling_params)
+
+        # Print the outputs.
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


@@ -1,93 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-
-import torch
-
-from tests.quantization.utils import is_quant_method_supported
-from vllm import LLM, SamplingParams
-from vllm.platforms import current_platform
-
-TEST_MODELS = [
-    ("facebook/opt-125m", {}),
-    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
-        "dtype": torch.float16,
-        "quantization": "compressed-tensors"
-    }),
-    ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
-        "dtype": torch.float16,
-        "quantization": "compressed-tensors"
-    }),
-    ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
-        "quantization": "compressed-tensors"
-    }),
-    ("meta-llama/Llama-3.2-1B-Instruct", {}),
-]
-
-if is_quant_method_supported("aqlm"):
-    TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
-        "quantization": "aqlm"
-    }))
-
-# TODO: figure out why this fails.
-if False and is_quant_method_supported("gguf"):  # noqa: SIM223
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
-        "quantization": "gguf"
-    }))
-
-if is_quant_method_supported("gptq"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
-        "quantization": "gptq"
-    }))
-
-if is_quant_method_supported("gptq_marlin"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
-        "quantization": "gptq_marlin"
-    }))
-
-if is_quant_method_supported("gptq_marlin_24"):
-    TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
-        "quantization": "gptq_marlin_24"
-    }))
-
-if is_quant_method_supported("marlin"):
-    TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
-        "quantization": "marlin"
-    }))
-
-if not current_platform.is_rocm() and is_quant_method_supported("awq"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
-        "quantization": "AWQ"
-    }))
-
-
-def check_full_graph_support(model,
-                             model_kwargs,
-                             optimization_level,
-                             tp_size=1):
-    # make sure these models can be captured in full graph mode
-    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
-
-    print(f"MODEL={model}")
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model=model,
-              enforce_eager=True,
-              tensor_parallel_size=tp_size,
-              disable_custom_all_reduce=True,
-              compilation_config=optimization_level,
-              **model_kwargs)
-    outputs = llm.generate(prompts, sampling_params)
-
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


@@ -3,7 +3,10 @@
 Run `pytest tests/distributed/test_comm_ops.py`.
 """
-import os
+from __future__ import annotations
+
+from typing import Any, Callable
 
 import pytest
 import ray
@@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel
 
 @ray.remote(num_gpus=1, max_calls=1)
-def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
-                           distributed_init_port: str):
+def all_reduce_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
 
 @ray.remote(num_gpus=1, max_calls=1)
-def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
-                           distributed_init_port: str):
+def all_gather_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
 
 @ray.remote(num_gpus=1, max_calls=1)
-def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
-                                      distributed_init_port: str):
+def broadcast_tensor_dict_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 
 @ray.remote(num_gpus=1, max_calls=1)
-def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
-                                      distributed_init_port: str):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_tensor_dict_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 
 @ray.remote(num_gpus=1, max_calls=1)
-def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
-                          distributed_init_port: str):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
     all_reduce_test_worker, all_gather_test_worker,
     broadcast_tensor_dict_test_worker
 ])
-def test_multi_process_tensor_parallel(tp_size, test_target):
-    multi_process_parallel(tp_size, 1, test_target)
+def test_multi_process_tensor_parallel(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    test_target: Callable[..., Any],
+):
+    multi_process_parallel(monkeypatch, tp_size, 1, test_target)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
@@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
 @pytest.mark.parametrize("pp_size", [2])
 @pytest.mark.parametrize(
     "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
-def test_multi_process_pipeline_parallel(pp_size, test_target):
-    multi_process_parallel(1, pp_size, test_target)
+def test_multi_process_pipeline_parallel(
+    monkeypatch: pytest.MonkeyPatch,
+    pp_size: int,
+    test_target: Callable[..., Any],
+):
+    multi_process_parallel(monkeypatch, 1, pp_size, test_target)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
@@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target):
     broadcast_tensor_dict_test_worker
 ])
 def test_multi_process_tensor_parallel_pipeline_parallel(
-        tp_size, pp_size, test_target):
-    multi_process_parallel(tp_size, pp_size, test_target)
+    tp_size: int,
+    pp_size: int,
+    test_target: Callable[..., Any],
+    monkeypatch: pytest.MonkeyPatch,
+):
+    multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
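
In this file the test's MonkeyPatch instance is threaded through multi_process_parallel into the Ray worker functions, and delenv(..., raising=False) is the drop-in replacement for os.environ.pop(..., None). A minimal sketch of that replacement in isolation (the worker name here is illustrative, not from the diff):

    import pytest

    def example_worker(monkeypatch: pytest.MonkeyPatch, rank: int):
        # Same effect as os.environ.pop("CUDA_VISIBLE_DEVICES", None):
        # raising=False makes the call a no-op when the variable is unset,
        # and the deletion is reverted when the patching is undone.
        monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)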


@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import os
 import random
 
 import pytest
@@ -23,8 +22,15 @@ for i, v in enumerate(test_sizes):
 
 @ray.remote(num_gpus=1, max_calls=1)
-def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def graph_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pp_size,
+    rank,
+    distributed_init_port,
+):
+    with monkeypatch.context() as m:
+        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
         device = torch.device(f"cuda:{rank}")
         torch.cuda.set_device(device)
         init_test_distributed_environment(tp_size, pp_size, rank,
@@ -79,8 +85,15 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
 
 @ray.remote(num_gpus=1, max_calls=1)
-def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def eager_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pp_size,
+    rank,
+    distributed_init_port,
+):
+    with monkeypatch.context() as m:
+        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
         device = torch.device(f"cuda:{rank}")
         torch.cuda.set_device(device)
         init_test_distributed_environment(tp_size, pp_size, rank,
@@ -110,8 +123,14 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
 @pytest.mark.parametrize("tp_size", [2])
 @pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
 @pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
-def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
+def test_custom_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pipeline_parallel_size,
+    test_target,
+):
     world_size = tp_size * pipeline_parallel_size
     if world_size > torch.cuda.device_count():
         pytest.skip("Not enough GPUs to run the test.")
-    multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
+    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
+                           test_target)


@@ -7,15 +7,17 @@ import pytest
 from vllm.distributed.utils import get_pp_indices
 
 
-def test_custom_layer_partition():
+def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
 
         def _verify(partition_str, num_layers, pp_size, goldens):
             bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
-            os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
+            m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
             for pp_rank, golden in enumerate(goldens):
                 assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
             if bak is not None:
-                os.environ["VLLM_PP_LAYER_PARTITION"] = bak
+                m.setenv("VLLM_PP_LAYER_PARTITION", bak)
 
         # Even partition
         _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
@@ -55,6 +57,10 @@ def test_custom_layer_partition():
     (5, 3, 1, (2, 4)),
     (5, 3, 2, (4, 5)),
 ])
-def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int,
-                               pp_rank: int, indices: tuple[int, int]):
+def test_uneven_auto_partition(
+    num_hidden_layers: int,
+    pp_size: int,
+    pp_rank: int,
+    indices: tuple[int, int],
+):
     assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
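
One note on `_verify` above: the diff keeps its manual save/restore of `VLLM_PP_LAYER_PARTITION` via `bak`. With monkeypatch that bookkeeping is optional, since `m.setenv` records the prior value and the context manager restores it on exit. A hedged sketch of the leaner form, shown only to illustrate the mechanism (this is not what the commit does):

    import pytest
    from vllm.distributed.utils import get_pp_indices

    def _verify(m: pytest.MonkeyPatch, partition_str, num_layers, pp_size, goldens):
        # m.setenv remembers the original value; it is restored automatically
        # when the monkeypatch context that owns `m` exits.
        m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
        for pp_rank, golden in enumerate(goldens):
            assert get_pp_indices(num_layers, pp_rank, pp_size) == golden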


@@ -1,11 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import os
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
 
 import pytest
 
 from ..utils import compare_two_settings, fork_new_process_for_each_test
 
+if TYPE_CHECKING:
+    from typing_extensions import LiteralString
+
 
 @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
     (2, "JackFram/llama-160m"),
@@ -15,7 +19,13 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test
     "FLASHINFER",
 ])
 @fork_new_process_for_each_test
-def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
+def test_pp_cudagraph(
+    monkeypatch: pytest.MonkeyPatch,
+    PP_SIZE: int,
+    MODEL_NAME: str,
+    ATTN_BACKEND: LiteralString,
+):
+    with monkeypatch.context() as m:
         cudagraph_args = [
             # use half precision for speed and memory savings in CI environment
             "--dtype",
@@ -25,7 +35,7 @@ def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
             "--distributed-executor-backend",
             "mp",
         ]
-    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
+        m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
 
         eager_args = cudagraph_args + ["--enforce-eager"]


@@ -49,7 +49,7 @@ TPU_TP_TEST_STR = ""  #"tensor_parallel_size=4"
 @pytest.mark.skipif(not current_platform.is_cuda()
                     and not current_platform.is_tpu(),
                     reason="V1 is currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
     """Run with the V1 Engine."""
 
     with monkeypatch.context() as m:
@@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
     run_test(more_args)
 
 
-def test_lm_eval_accuracy_v0_engine(monkeypatch):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
     """Run with the V0 Engine."""
 
     with monkeypatch.context() as m:


@@ -53,21 +53,29 @@ def cache_models():
 @pytest.mark.skip_global_cleanup
 @pytest.mark.usefixtures("cache_models")
-def test_offline_mode(monkeypatch):
+def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
     # Set HF to offline mode and ensure we can still construct an LLM
+    with monkeypatch.context() as m:
         try:
-            monkeypatch.setenv("HF_HUB_OFFLINE", "1")
-            monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1")
+            m.setenv("HF_HUB_OFFLINE", "1")
+            m.setenv("VLLM_NO_USAGE_STATS", "1")
 
             def disable_connect(*args, **kwargs):
                 raise RuntimeError("No http calls allowed")
 
-            monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect",
-                                disable_connect)
-            monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect",
-                                disable_connect)
+            m.setattr(
+                urllib3.connection.HTTPConnection,
+                "connect",
+                disable_connect,
+            )
+            m.setattr(
+                urllib3.connection.HTTPSConnection,
+                "connect",
+                disable_connect,
+            )
 
-            # Need to re-import huggingface_hub and friends to setup offline mode
+            # Need to re-import huggingface_hub
+            # and friends to setup offline mode
             _re_import_modules()
             # Cached model files should be used in offline mode
             for model_config in MODEL_CONFIGS:
@@ -75,10 +83,7 @@ def test_offline_mode(monkeypatch):
         finally:
             # Reset the environment after the test
            # NB: Assuming tests are run in online mode
-            monkeypatch.delenv("HF_HUB_OFFLINE")
-            monkeypatch.delenv("VLLM_NO_USAGE_STATS")
             _re_import_modules()
-            pass
 
 
 def _re_import_modules():
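
The same MonkeyPatch object handles more than environment variables: in the hunk above it also swaps out `connect` on urllib3's connection classes so any accidental network call fails fast, and the originals come back when the context exits. A minimal sketch of that piece in isolation (the test name is illustrative, not from the diff):

    import pytest
    import urllib3

    def _disable_connect(*args, **kwargs):
        raise RuntimeError("No http calls allowed")

    def test_stays_offline(monkeypatch: pytest.MonkeyPatch):
        with monkeypatch.context() as m:
            m.setenv("HF_HUB_OFFLINE", "1")
            # Any HTTP(S) connection attempt now raises; the original
            # connect methods are restored when the block exits.
            m.setattr(urllib3.connection.HTTPConnection, "connect", _disable_connect)
            m.setattr(urllib3.connection.HTTPSConnection, "connect", _disable_connect)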


@@ -70,7 +70,7 @@ def run_test(more_args):
 @pytest.mark.skipif(not current_platform.is_cuda()
                     and not current_platform.is_tpu(),
                     reason="V1 currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
     """Run with the V1 Engine."""
 
     with monkeypatch.context() as m:
@@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
 
 @pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
-def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
+                                    more_args):
     """Run with the V0 Engine."""
 
     with monkeypatch.context() as m:


@@ -5,13 +5,12 @@ from unittest.mock import Mock, patch
 import pytest
 import torch
 
-from tests.kernels.utils import override_backend_env_variable
 from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
 from vllm.platforms.openvino import OpenVinoPlatform
 from vllm.platforms.rocm import RocmPlatform
-from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
+from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
 
 
 @pytest.fixture(autouse=True)
@@ -25,54 +24,67 @@ def clear_cache():
     "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
 @pytest.mark.parametrize("use_v1", [True, False])
 @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
-def test_env(name: str, use_v1: bool, device: str, monkeypatch):
+def test_env(
+    name: str,
+    use_v1: bool,
+    device: str,
+    monkeypatch: pytest.MonkeyPatch,
+):
     """Test that the attention selector can be set via environment variable.
     Note that we do not test FlashAttn because it is the default backend.
     """
-    monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
-    override_backend_env_variable(monkeypatch, name)
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv(STR_BACKEND_ENV_VAR, name)
 
         if device == "cpu":
-            with patch("vllm.attention.selector.current_platform", CpuPlatform()):
-                backend = get_attn_backend(16, torch.float16, torch.float16, 16,
-                                           False)
+            with patch("vllm.attention.selector.current_platform",
+                       CpuPlatform()):
+                backend = get_attn_backend(16, torch.float16, torch.float16,
+                                           16, False)
             assert backend.get_name() == "TORCH_SDPA"
         elif device == "hip":
-            with patch("vllm.attention.selector.current_platform", RocmPlatform()):
-                backend = get_attn_backend(16, torch.float16, torch.float16, 16,
-                                           False)
+            with patch("vllm.attention.selector.current_platform",
+                       RocmPlatform()):
+                backend = get_attn_backend(16, torch.float16, torch.float16,
+                                           16, False)
             EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
             assert backend.get_name() == EXPECTED
         elif device == "openvino":
             with patch("vllm.attention.selector.current_platform",
                        OpenVinoPlatform()), patch.dict('sys.modules',
                                                        {'openvino': Mock()}):
-                backend = get_attn_backend(16, torch.float16, torch.float16, 16,
-                                           False)
+                backend = get_attn_backend(16, torch.float16, torch.float16,
                                            16, False)
             assert backend.get_name() == "OPENVINO"
         else:
             if name in ["XFORMERS", "FLASHINFER"]:
                 with patch("vllm.attention.selector.current_platform",
                            CudaPlatform()):
-                    backend = get_attn_backend(16, torch.float16, torch.float16,
-                                               16, False)
+                    backend = get_attn_backend(16, torch.float16,
                                                torch.float16, 16, False)
                 EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
                 assert backend.get_name() == EXPECTED
 
 
-def test_flash_attn(monkeypatch):
+def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     """Test FlashAttn validation."""
     # TODO: When testing for v1, pipe in `use_v1` as an argument to
     # get_attn_backend
 
-    override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
 
         # Unsupported CUDA arch
-        with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
+        monkeypatch.setattr(torch.cuda, "get_device_capability", lambda:
+                            (7, 5))
         backend = get_attn_backend(16, torch.float16, None, 16, False)
         assert backend.get_name() != STR_FLASH_ATTN_VAL
 
+        # Reset the monkeypatch for subsequent tests
+        monkeypatch.undo()
+
         # Unsupported data type
         backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
         assert backend.get_name() != STR_FLASH_ATTN_VAL
@@ -86,10 +98,19 @@ def test_flash_attn(monkeypatch):
         assert backend.get_name() != STR_FLASH_ATTN_VAL
 
         # flash-attn is not installed
-        with patch.dict('sys.modules', {'vllm_flash_attn': None}):
+        import sys
+        original_module = sys.modules.get('vllm_flash_attn')
+        monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None)
         backend = get_attn_backend(16, torch.float16, None, 16, False)
         assert backend.get_name() != STR_FLASH_ATTN_VAL
 
+        # Restore the original module if it existed
+        if original_module is not None:
+            monkeypatch.setitem(sys.modules, 'vllm_flash_attn',
+                                original_module)
+        else:
+            monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False)
+
         # Unsupported head size
         backend = get_attn_backend(17, torch.float16, None, 16, False)
         assert backend.get_name() != STR_FLASH_ATTN_VAL
@@ -100,12 +121,14 @@ def test_flash_attn(monkeypatch):
 
 @pytest.mark.parametrize("use_v1", [True, False])
-def test_invalid_env(use_v1: bool, monkeypatch):
-    """Ignore the invalid env variable if it is set."""
-    monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
-    override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
+def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
 
-    with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+    with monkeypatch.context() as m, patch(
+            "vllm.attention.selector.current_platform", CudaPlatform()):
+        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
+
+        # Test with head size 32
         backend = get_attn_backend(32, torch.float16, None, 16, False)
         EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"
         assert backend.get_name() == EXPECTED
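
Two MonkeyPatch features used above are worth isolating: setitem/delitem patch a dictionary entry (here sys.modules, to simulate a missing optional dependency), and undo() rolls back everything recorded so far partway through a test. A minimal sketch (the test name is illustrative, not from the diff):

    import sys
    import pytest

    def test_without_flash_attn(monkeypatch: pytest.MonkeyPatch):
        # Pretend vllm_flash_attn is not importable for this test only.
        monkeypatch.setitem(sys.modules, "vllm_flash_attn", None)
        ...  # exercise the fallback path
        # Revert every patch recorded on this MonkeyPatch so far; anything
        # applied afterwards is still cleaned up at normal test teardown.
        monkeypatch.undo()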


@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import os
-
 import pytest
 import torch
@@ -11,8 +9,9 @@ from vllm import _custom_ops as ops  # noqa: F401
 
 @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
                     reason="AWQ is not supported on this GPU type.")
-def test_awq_dequantize_opcheck():
-    os.environ["VLLM_USE_TRITON_AWQ"] = "0"
+def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_TRITON_AWQ", "0")
         qweight = torch.randint(-2000000000,
                                 2000000000, (8192, 256),
                                 device='cuda',
@@ -29,8 +28,9 @@ def test_awq_dequantize_opcheck():
 @pytest.mark.skip(reason="Not working; needs investigation.")
 @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
                     reason="AWQ is not supported on this GPU type.")
-def test_awq_gemm_opcheck():
-    os.environ["VLLM_USE_TRITON_AWQ"] = "0"
+def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_TRITON_AWQ", "0")
         input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
         qweight = torch.randint(-2000000000,
                                 2000000000, (8192, 256),


@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from unittest.mock import patch
-
 import pytest
 import torch
 
-from tests.kernels.utils import override_backend_env_variable
 from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
 from vllm.platforms.rocm import RocmPlatform
+from vllm.utils import STR_BACKEND_ENV_VAR
 
 
 @pytest.fixture(autouse=True)
@@ -17,15 +15,19 @@ def clear_cache():
     _cached_get_attn_backend.cache_clear()
 
 
-def test_selector(monkeypatch):
-    """Test that the attention selector for ROCm.
-    """
-    override_backend_env_variable(monkeypatch, "ROCM_FLASH")
+def test_selector(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")
 
-    with patch("vllm.attention.selector.current_platform", RocmPlatform()):
+        # Set the current platform to ROCm using monkeypatch
+        monkeypatch.setattr("vllm.attention.selector.current_platform",
+                            RocmPlatform())
+
+        # Test standard ROCm attention
         backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
         assert (backend.get_name() == "ROCM_FLASH"
                 or backend.get_name() == "ROCM_ATTN_VLLM_V1")
 
         # mla test for deepseek related
         backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
                                    False, True)


@@ -12,11 +12,10 @@ import pytest
 from tests.kernels.utils import override_backend_env_variable
 from tests.quantization.utils import is_quant_method_supported
 from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR
 
 from ...utils import check_logprobs_close
 
-os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
 
 @pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
@@ -55,13 +54,15 @@ def test_models(
     backend: str,
     tensor_parallel_size: int,
     disable_async_output_proc: bool,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """
     Only checks log probs match to cover the discrepancy in
     numerical sensitive kernels.
     """
-    override_backend_env_variable(monkeypatch, backend)
+    with monkeypatch.context() as m:
+        m.setenv("TOKENIZERS_PARALLELISM", 'true')
+        m.setenv(STR_BACKEND_ENV_VAR, backend)
 
         MAX_MODEL_LEN = 1024
         NUM_LOG_PROBS = 8
@@ -119,11 +120,14 @@ def test_cpu_models(
     test_model: str,
     max_tokens: int,
     disable_async_output_proc: bool,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """
     Only checks log probs match to cover the discrepancy in
     numerical sensitive kernels.
     """
+    with monkeypatch.context() as m:
+        m.setenv("TOKENIZERS_PARALLELISM", 'true')
 
         MAX_MODEL_LEN = 1024
         NUM_LOG_PROBS = 8


@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import importlib.util import importlib.util
import math import math
@ -11,6 +12,7 @@ from scipy.spatial.distance import cosine
import vllm import vllm
import vllm.config import vllm.config
from vllm.utils import STR_BACKEND_ENV_VAR
from ....utils import RemoteOpenAIServer from ....utils import RemoteOpenAIServer
@ -29,9 +31,10 @@ def _arr(arr):
return array("i", arr) return array("i", arr)
def test_find_array(monkeypatch): def test_find_array(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
from vllm.model_executor.models.gritlm import GritLMPooler from vllm.model_executor.models.gritlm import GritLMPooler
@ -53,9 +56,6 @@ def test_find_array(monkeypatch):
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server_embedding(): def server_embedding():
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
with pytest.MonkeyPatch.context() as mp:
mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
@ -69,7 +69,10 @@ def server_generate():
@pytest_asyncio.fixture @pytest_asyncio.fixture
async def client_embedding(server_embedding: RemoteOpenAIServer): async def client_embedding(monkeypatch: pytest.MonkeyPatch,
server_embedding: RemoteOpenAIServer):
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
async with server_embedding.get_async_client() as async_client: async with server_embedding.get_async_client() as async_client:
yield async_client yield async_client
@ -80,14 +83,20 @@ async def client_generate(server_generate: RemoteOpenAIServer):
yield async_client yield async_client
def run_llm_encode(llm: vllm.LLM, queries: list[str], def run_llm_encode(
instruction: str) -> list[float]: llm: vllm.LLM,
queries: list[str],
instruction: str,
) -> list[float]:
outputs = llm.encode([instruction + q for q in queries], ) outputs = llm.encode([instruction + q for q in queries], )
return [output.outputs.embedding for output in outputs] return [output.outputs.embedding for output in outputs]
async def run_client_embeddings(client: vllm.LLM, queries: list[str], async def run_client_embeddings(
instruction: str) -> list[float]: client: vllm.LLM,
queries: list[str],
instruction: str,
) -> list[float]:
outputs = await client.embeddings.create( outputs = await client.embeddings.create(
model=MODEL_NAME, model=MODEL_NAME,
input=[instruction + q for q in queries], input=[instruction + q for q in queries],
@ -106,7 +115,7 @@ def get_test_data():
README.md in https://github.com/ContextualAI/gritlm README.md in https://github.com/ContextualAI/gritlm
""" """
q_instruction = gritlm_instruction( q_instruction = gritlm_instruction(
"Given a scientific paper title, retrieve the paper's abstract") "Given a scientific paper title, retrieve the paper's abstract", )
queries = [ queries = [
"Bitcoin: A Peer-to-Peer Electronic Cash System", "Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning", "Generative Representational Instruction Tuning",
@ -136,9 +145,10 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]):
assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001)
def test_gritlm_offline_embedding(monkeypatch): def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
queries, q_instruction, documents, d_instruction = get_test_data() queries, q_instruction, documents, d_instruction = get_test_data()
@ -160,7 +170,7 @@ def test_gritlm_offline_embedding(monkeypatch):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_gritlm_api_server_embedding( async def test_gritlm_api_server_embedding(
client_embedding: openai.AsyncOpenAI): client_embedding: openai.AsyncOpenAI, ):
queries, q_instruction, documents, d_instruction = get_test_data() queries, q_instruction, documents, d_instruction = get_test_data()
d_rep = await run_client_embeddings( d_rep = await run_client_embeddings(
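The two hunks above capture the recurring refactor in this commit: the XFORMERS override is no longer applied with a module-level pytest.MonkeyPatch.context() inside the server fixture, but through the function-scoped monkeypatch fixture in the client fixture and the offline test. Both forms are part of pytest's public MonkeyPatch API; a minimal, self-contained sketch of when each one applies (DUMMY_BACKEND_VAR is a made-up variable name for illustration):

    import os

    import pytest


    @pytest.fixture(scope="module")
    def patched_module_env():
        # Non-function-scoped fixtures cannot request the `monkeypatch`
        # fixture, so the classmethod form is used there instead.
        with pytest.MonkeyPatch.context() as mp:
            mp.setenv("DUMMY_BACKEND_VAR", "XFORMERS")
            yield


    def test_function_scoped_override(monkeypatch: pytest.MonkeyPatch):
        # Inside a test, the fixture plus .context() scopes the override to
        # the with-block and undoes it even if the body raises.
        with monkeypatch.context() as m:
            m.setenv("DUMMY_BACKEND_VAR", "XFORMERS")
            assert os.environ["DUMMY_BACKEND_VAR"] == "XFORMERS"
        assert "DUMMY_BACKEND_VAR" not in os.environ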

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
@ -11,20 +9,28 @@ from ..utils import fork_new_process_for_each_test
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_plugin(dummy_opt_path, monkeypatch): def test_plugin(
monkeypatch: pytest.MonkeyPatch,
dummy_opt_path: str,
):
# V1 shuts down rather than raising an error here. # V1 shuts down rather than raising an error here.
monkeypatch.setenv("VLLM_USE_V1", "0") with monkeypatch.context() as m:
os.environ["VLLM_PLUGINS"] = "" m.setenv("VLLM_USE_V1", "0")
m.setenv("VLLM_PLUGINS", "")
with pytest.raises(Exception) as excinfo: with pytest.raises(Exception) as excinfo:
LLM(model=dummy_opt_path, load_format="dummy") LLM(model=dummy_opt_path, load_format="dummy")
error_msg = "has no vLLM implementation and " \ error_msg = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM" # noqa: E501
"the Transformers implementation is not compatible with vLLM"
assert (error_msg in str(excinfo.value)) assert (error_msg in str(excinfo.value))
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_oot_registration_text_generation(dummy_opt_path): def test_oot_registration_text_generation(
os.environ["VLLM_PLUGINS"] = "register_dummy_model" monkeypatch: pytest.MonkeyPatch,
dummy_opt_path: str,
):
with monkeypatch.context() as m:
m.setenv("VLLM_PLUGINS", "register_dummy_model")
prompts = ["Hello, my name is", "The text does not matter"] prompts = ["Hello, my name is", "The text does not matter"]
sampling_params = SamplingParams(temperature=0) sampling_params = SamplingParams(temperature=0)
llm = LLM(model=dummy_opt_path, load_format="dummy") llm = LLM(model=dummy_opt_path, load_format="dummy")
@ -39,8 +45,12 @@ def test_oot_registration_text_generation(dummy_opt_path):
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_oot_registration_embedding(dummy_gemma2_embedding_path): def test_oot_registration_embedding(
os.environ["VLLM_PLUGINS"] = "register_dummy_model" monkeypatch: pytest.MonkeyPatch,
dummy_gemma2_embedding_path: str,
):
with monkeypatch.context() as m:
m.setenv("VLLM_PLUGINS", "register_dummy_model")
prompts = ["Hello, my name is", "The text does not matter"] prompts = ["Hello, my name is", "The text does not matter"]
llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
outputs = llm.embed(prompts) outputs = llm.embed(prompts)
@ -53,8 +63,12 @@ image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_oot_registration_multimodal(dummy_llava_path, monkeypatch): def test_oot_registration_multimodal(
os.environ["VLLM_PLUGINS"] = "register_dummy_model" monkeypatch: pytest.MonkeyPatch,
dummy_llava_path: str,
):
with monkeypatch.context() as m:
m.setenv("VLLM_PLUGINS", "register_dummy_model")
prompts = [{ prompts = [{
"prompt": "What's in the image?<image>", "prompt": "What's in the image?<image>",
"multi_modal_data": { "multi_modal_data": {

View File

@ -235,9 +235,11 @@ async def test_bad_request(tmp_socket):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_mp_crash_detection(monkeypatch): async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.") parser = FlexibleArgumentParser(
description="vLLM's remote OpenAI server.")
parser = make_arg_parser(parser) parser = make_arg_parser(parser)
args = parser.parse_args([]) args = parser.parse_args([])
@ -245,14 +247,15 @@ async def test_mp_crash_detection(monkeypatch):
def mock_init(): def mock_init():
raise ValueError raise ValueError
monkeypatch.setattr(LLMEngine, "__init__", mock_init) m.setattr(LLMEngine, "__init__", mock_init)
start = time.perf_counter() start = time.perf_counter()
async with build_async_engine_client(args): async with build_async_engine_client(args):
pass pass
end = time.perf_counter() end = time.perf_counter()
assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s " assert end - start < 60, (
"Expected vLLM to gracefully shutdown in <60s "
"if there is an error in the startup.") "if there is an error in the startup.")

View File

@ -5,7 +5,7 @@ from typing import Optional
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close from ..models.utils import check_logprobs_close
from ..utils import (completions_with_server_args, get_client_text_generations, from ..utils import (completions_with_server_args, get_client_text_generations,
@ -52,7 +52,7 @@ async def test_multi_step(
num_logprobs: Optional[int], num_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
enable_chunked_prefill: bool, enable_chunked_prefill: bool,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol """Test vLLM engine with multi-step scheduling in an OpenAI-protocol
client/server environment. client/server environment.
@ -82,7 +82,8 @@ async def test_multi_step(
pytest.skip("Multi-step with Chunked-Prefill only supports" pytest.skip("Multi-step with Chunked-Prefill only supports"
"PP=1 and FLASH_ATTN backend") "PP=1 and FLASH_ATTN backend")
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts prompts = example_prompts
if len(prompts) < num_prompts: if len(prompts) < num_prompts:
@ -135,8 +136,10 @@ async def test_multi_step(
# Assert multi-step scheduling produces nearly-identical logprobs # Assert multi-step scheduling produces nearly-identical logprobs
# to single-step scheduling. # to single-step scheduling.
ref_text_logprobs = get_client_text_logprob_generations(ref_completions) ref_text_logprobs = get_client_text_logprob_generations(
test_text_logprobs = get_client_text_logprob_generations(test_completions) ref_completions)
test_text_logprobs = get_client_text_logprob_generations(
test_completions)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=ref_text_logprobs, outputs_0_lst=ref_text_logprobs,
outputs_1_lst=test_text_logprobs, outputs_1_lst=test_text_logprobs,
@ -152,7 +155,7 @@ async def test_multi_step(
async def test_multi_step_pp_smoke( async def test_multi_step_pp_smoke(
tp_size: int, tp_size: int,
pp_size: int, pp_size: int,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Smoke test for the vLLM engine with multi-step scheduling in an Smoke test for the vLLM engine with multi-step scheduling in an
@ -174,7 +177,8 @@ async def test_multi_step_pp_smoke(
attention_backend = "FLASH_ATTN" attention_backend = "FLASH_ATTN"
max_num_seqs = 3 max_num_seqs = 3
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
# Prompt from the ShareGPT dataset # Prompt from the ShareGPT dataset
prompts = [ prompts = [
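In both multi-step files, the override_backend_env_variable(monkeypatch, ...) helper from tests/kernels/utils is replaced by setting STR_BACKEND_ENV_VAR (vLLM's name for the VLLM_ATTENTION_BACKEND override) directly inside a monkeypatch context; most of the remaining churn in these hunks is the test bodies moving one indentation level deeper into the new with-block. A minimal sketch of the resulting shape, with the constant hard-coded so the example has no vLLM dependency:

    import os

    import pytest

    # Hard-coded stand-in for vllm.utils.STR_BACKEND_ENV_VAR.
    STR_BACKEND_ENV_VAR = "VLLM_ATTENTION_BACKEND"


    @pytest.mark.parametrize("attention_backend", ["FLASH_ATTN", "FLASHINFER"])
    def test_backend_override(
        attention_backend: str,
        monkeypatch: pytest.MonkeyPatch,
    ) -> None:
        with monkeypatch.context() as m:
            m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
            # ... engine construction and assertions would go here; the
            # override disappears when the block exits, keeping the
            # parametrized cases isolated from each other.
            assert os.environ[STR_BACKEND_ENV_VAR] == attention_backend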

View File

@ -7,7 +7,7 @@ from typing import Optional
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal from ..models.utils import check_logprobs_close, check_outputs_equal
@ -42,7 +42,7 @@ def test_multi_step_llm(
num_prompts: int, num_prompts: int,
num_logprobs: Optional[int], num_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test vLLM engine with multi-step scheduling via sync LLM Engine. """Test vLLM engine with multi-step scheduling via sync LLM Engine.
@ -70,7 +70,8 @@ def test_multi_step_llm(
num_logprobs: corresponds to the `logprobs` argument to the OpenAI num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> 1 logprob returned. completions endpoint; `None` -> 1 logprob returned.
""" """
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts prompts = example_prompts
if len(prompts) < num_prompts: if len(prompts) < num_prompts:
@ -136,7 +137,7 @@ def test_multi_step_llm_w_prompt_logprobs(
num_logprobs: Optional[int], num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int], num_prompt_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test prompt logprobs with multi-step scheduling via sync LLM Engine. """Test prompt logprobs with multi-step scheduling via sync LLM Engine.
@ -166,7 +167,8 @@ def test_multi_step_llm_w_prompt_logprobs(
note that this argument is not supported by the note that this argument is not supported by the
OpenAI completions endpoint. OpenAI completions endpoint.
""" """
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts prompts = example_prompts
if len(prompts) < num_prompts: if len(prompts) < num_prompts:
@ -230,7 +232,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
num_prompts: int, num_prompts: int,
num_logprobs: Optional[int], num_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test vLLM engine with multi-step+"single-step chunked prefill"+APC. """Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
@ -293,13 +295,14 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
# #
# The incorrect scheduling behavior - if it occurs - will cause an exception # The incorrect scheduling behavior - if it occurs - will cause an exception
# in the model runner resulting from `do_sample=False`. # in the model runner resulting from `do_sample=False`.
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
assert len(example_prompts) >= 2 assert len(example_prompts) >= 2
challenge_prompts = copy.deepcopy(example_prompts) challenge_prompts = copy.deepcopy(example_prompts)
challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient ' challenge_prompts[0] = (
'inference and serving engine for LLMs.\n' 'vLLM is a high-throughput and memory-efficient '
) # 24 tok 'inference and serving engine for LLMs.\n') # 24 tok
challenge_prompts[1] = ( challenge_prompts[1] = (
'Briefly describe the major milestones in the ' 'Briefly describe the major milestones in the '
'development of artificial intelligence from 1950 to 2020.\n' 'development of artificial intelligence from 1950 to 2020.\n'
@ -326,9 +329,9 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
max_num_seqs=4, max_num_seqs=4,
block_size=16, block_size=16,
) as vllm_model: ) as vllm_model:
outputs_baseline = (vllm_model.generate_greedy( outputs_baseline = (
challenge_prompts, max_tokens) if num_logprobs is None else vllm_model.generate_greedy(challenge_prompts, max_tokens) if
vllm_model.generate_greedy_logprobs( num_logprobs is None else vllm_model.generate_greedy_logprobs(
challenge_prompts, max_tokens, num_logprobs)) challenge_prompts, max_tokens, num_logprobs))
# multi-step+"single-step chunked prefill"+APC # multi-step+"single-step chunked prefill"+APC
@ -346,9 +349,9 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
max_num_seqs=4, max_num_seqs=4,
block_size=16, block_size=16,
) as vllm_model: ) as vllm_model:
outputs_w_features = (vllm_model.generate_greedy( outputs_w_features = (
challenge_prompts, max_tokens) if num_logprobs is None else vllm_model.generate_greedy(challenge_prompts, max_tokens) if
vllm_model.generate_greedy_logprobs( num_logprobs is None else vllm_model.generate_greedy_logprobs(
challenge_prompts, max_tokens, num_logprobs)) challenge_prompts, max_tokens, num_logprobs))
if num_logprobs is None: if num_logprobs is None:

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import neuronxcc.nki.language as nl import neuronxcc.nki.language as nl
import pytest import pytest
@ -99,6 +98,7 @@ def ref_block_tables_transform(
) )
@torch.inference_mode() @torch.inference_mode()
def test_load_and_transform_block_tables( def test_load_and_transform_block_tables(
monkeypatch: pytest.MonkeyPatch,
num_tiles, num_tiles,
num_blocks_per_tile, num_blocks_per_tile,
q_head_per_kv_head, q_head_per_kv_head,
@ -108,12 +108,12 @@ def test_load_and_transform_block_tables(
device = xm.xla_device() device = xm.xla_device()
compiler_flags = [ compiler_flags_str = " ".join([
"-O1", "-O1",
"--retry_failed_compilation", "--retry_failed_compilation",
] ])
compiler_flags_str = " ".join(compiler_flags) with monkeypatch.context() as m:
os.environ["NEURON_CC_FLAGS"] = compiler_flags_str m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
torch.manual_seed(10000) torch.manual_seed(10000)
torch.set_printoptions(sci_mode=False) torch.set_printoptions(sci_mode=False)

View File

@ -320,6 +320,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
]) ])
@torch.inference_mode() @torch.inference_mode()
def test_contexted_kv_attention( def test_contexted_kv_attention(
monkeypatch: pytest.MonkeyPatch,
prefill_batch_size: int, prefill_batch_size: int,
decode_batch_size: int, decode_batch_size: int,
num_heads: int, num_heads: int,
@ -329,7 +330,6 @@ def test_contexted_kv_attention(
large_tile_size, large_tile_size,
mixed_precision: bool, mixed_precision: bool,
) -> None: ) -> None:
import os
import torch_xla.core.xla_model as xm import torch_xla.core.xla_model as xm
@ -340,12 +340,12 @@ def test_contexted_kv_attention(
device = xm.xla_device() device = xm.xla_device()
compiler_flags = [ compiler_flags_str = " ".join([
"-O1", "-O1",
"--retry_failed_compilation", "--retry_failed_compilation",
] ])
compiler_flags_str = " ".join(compiler_flags) with monkeypatch.context() as m:
os.environ["NEURON_CC_FLAGS"] = compiler_flags_str m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
torch.manual_seed(0) torch.manual_seed(0)
torch.set_printoptions(sci_mode=False) torch.set_printoptions(sci_mode=False)
@ -415,7 +415,8 @@ def test_contexted_kv_attention(
num_active_blocks = pad_to_multiple(num_active_blocks, num_active_blocks = pad_to_multiple(num_active_blocks,
large_tile_size // block_size) large_tile_size // block_size)
context_kv_len = num_active_blocks * block_size context_kv_len = num_active_blocks * block_size
assert (context_kv_len % assert (
context_kv_len %
large_tile_size == 0), f"invalid context_kv_len={context_kv_len}" large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"
# pad QKV tensors # pad QKV tensors
@ -476,9 +477,11 @@ def test_contexted_kv_attention(
"constant", "constant",
0, 0,
).bool() ).bool()
attn_mask = torch.concat([prior_mask_padded, active_mask_padded], dim=1) attn_mask = torch.concat([prior_mask_padded, active_mask_padded],
dim=1)
attn_mask = reorder_context_mask(attn_mask, large_tile_size, block_size) attn_mask = reorder_context_mask(attn_mask, large_tile_size,
block_size)
input_args = ( input_args = (
query.to(device=device), query.to(device=device),
@ -508,6 +511,7 @@ def test_contexted_kv_attention(
"constant", "constant",
0, 0,
) )
output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :] output_ref = output_ref_padded.transpose(
0, 1)[0, :num_actual_tokens, :, :]
torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)

View File

@ -1,10 +1,10 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import pytest
import torch import torch
from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import get_attn_backend from vllm.attention.selector import get_attn_backend
from vllm.utils import STR_INVALID_VAL from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL
def test_platform_plugins(): def test_platform_plugins():
@ -25,8 +25,9 @@ def test_platform_plugins():
f" is loaded. The first import:\n{_init_trace}") f" is loaded. The first import:\n{_init_trace}")
def test_oot_attention_backend(monkeypatch): def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
# ignore the backend env variable if it is set # ignore the backend env variable if it is set
override_backend_env_variable(monkeypatch, STR_INVALID_VAL) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert backend.get_name() == "Dummy_Backend" assert backend.get_name() == "Dummy_Backend"

View File

@ -22,8 +22,9 @@ class DummyV1Scheduler(V1Scheduler):
raise Exception("Exception raised by DummyV1Scheduler") raise Exception("Exception raised by DummyV1Scheduler")
def test_scheduler_plugins_v0(monkeypatch): def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_V1", "0") with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
with pytest.raises(Exception) as exception_info: with pytest.raises(Exception) as exception_info:
engine_args = EngineArgs( engine_args = EngineArgs(
@ -38,14 +39,16 @@ def test_scheduler_plugins_v0(monkeypatch):
engine.add_request("0", "foo", sampling_params) engine.add_request("0", "foo", sampling_params)
engine.step() engine.step()
assert str(exception_info.value) == "Exception raised by DummyV0Scheduler" assert str(
exception_info.value) == "Exception raised by DummyV0Scheduler"
def test_scheduler_plugins_v1(monkeypatch): def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_V1", "1") with monkeypatch.context() as m:
# Explicitly turn off engine multiprocessing so that the scheduler runs in m.setenv("VLLM_USE_V1", "1")
# this process # Explicitly turn off engine multiprocessing so
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") # that the scheduler runs in this process
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with pytest.raises(Exception) as exception_info: with pytest.raises(Exception) as exception_info:
@ -61,4 +64,5 @@ def test_scheduler_plugins_v1(monkeypatch):
engine.add_request("0", "foo", sampling_params) engine.add_request("0", "foo", sampling_params)
engine.step() engine.step()
assert str(exception_info.value) == "Exception raised by DummyV1Scheduler" assert str(
exception_info.value) == "Exception raised by DummyV1Scheduler"

View File

@ -4,25 +4,29 @@
Run `pytest tests/prefix_caching/test_prefix_caching.py`. Run `pytest tests/prefix_caching/test_prefix_caching.py`.
""" """
from __future__ import annotations
import pytest import pytest
from tests.conftest import VllmRunner from tests.conftest import VllmRunner
from tests.core.utils import SchedulerProxy, create_dummy_prompt from tests.core.utils import SchedulerProxy, create_dummy_prompt
from tests.kernels.utils import override_backend_env_variable
from vllm import SamplingParams, TokensPrompt from vllm import SamplingParams, TokensPrompt
from vllm.core.scheduler import Scheduler from vllm.core.scheduler import Scheduler
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_outputs_equal from ..models.utils import check_outputs_equal
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch): def use_v0_only(monkeypatch: pytest.MonkeyPatch):
""" """
This module relies on V0 internals, so set VLLM_USE_V1=0. This module relies on V0 internals, so set VLLM_USE_V1=0.
""" """
monkeypatch.setenv('VLLM_USE_V1', '0') with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
MODELS = [ MODELS = [
@ -56,7 +60,7 @@ def test_mixed_requests(
cached_position: int, cached_position: int,
enable_chunked_prefill: bool, enable_chunked_prefill: bool,
block_size: int, block_size: int,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Test the case when some sequences have the prefix cache hit Test the case when some sequences have the prefix cache hit
@ -67,7 +71,8 @@ def test_mixed_requests(
pytest.skip("Flashinfer does not support ROCm/HIP.") pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm(): if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.") pytest.skip("Xformers does not support ROCm/HIP.")
override_backend_env_variable(monkeypatch, backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, backend)
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
@ -81,11 +86,14 @@ def test_mixed_requests(
block_size=block_size, block_size=block_size,
) as vllm_model: ) as vllm_model:
# Run the first prompt so the cache is populated # Run the first prompt so the cache is populated
vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens) vllm_outputs = vllm_model.generate_greedy([cached_prompt],
max_tokens)
# Run all the prompts # Run all the prompts
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) greedy_params = SamplingParams(temperature=0.0,
req_outputs = vllm_model.model.generate(example_prompts, greedy_params) max_tokens=max_tokens)
req_outputs = vllm_model.model.generate(example_prompts,
greedy_params)
# Verify number of cached tokens # Verify number of cached tokens
for i in range(len(req_outputs)): for i in range(len(req_outputs)):
@ -95,8 +103,8 @@ def test_mixed_requests(
block_size) * block_size block_size) * block_size
else: else:
expected_num_cached_tokens = 0 expected_num_cached_tokens = 0
assert ( assert (req_outputs[i].num_cached_tokens ==
req_outputs[i].num_cached_tokens == expected_num_cached_tokens) expected_num_cached_tokens)
vllm_outputs = [( vllm_outputs = [(
output.prompt_token_ids + list(output.outputs[0].token_ids), output.prompt_token_ids + list(output.outputs[0].token_ids),
@ -115,14 +123,15 @@ def test_mixed_requests(
def test_unstable_prompt_sequence( def test_unstable_prompt_sequence(
vllm_runner, vllm_runner,
backend: str, backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
if backend == "FLASHINFER" and current_platform.is_rocm(): if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.") pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm(): if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.") pytest.skip("Xformers does not support ROCm/HIP.")
override_backend_env_variable(monkeypatch, backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, backend)
with vllm_runner( with vllm_runner(
"Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen2.5-0.5B-Instruct",

View File

@ -56,12 +56,11 @@ def test_gc():
assert allocated < 50 * 1024 * 1024 assert allocated < 50 * 1024 * 1024
def test_model_from_modelscope(monkeypatch): def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat" with monkeypatch.context() as m:
monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True") m.setenv("VLLM_USE_MODELSCOPE", "True")
try: llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")
llm = LLM(model=MODELSCOPE_MODEL_NAME)
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
@ -73,10 +72,3 @@ def test_model_from_modelscope(monkeypatch):
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
assert len(outputs) == 4 assert len(outputs) == 4
finally:
monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False)
if __name__ == "__main__":
import pytest
pytest.main([__file__])
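The ModelScope test previously relied on a try/finally with monkeypatch.delenv to guarantee cleanup; the context manager now gives the same guarantee with less code. A sketch of the equivalence (SOME_UNRELATED_FLAG is a made-up name, shown only to illustrate delenv, which remains available when a variable must be absent):

    import os

    import pytest


    def test_modelscope_flag(monkeypatch: pytest.MonkeyPatch):
        with monkeypatch.context() as m:
            m.setenv("VLLM_USE_MODELSCOPE", "True")
            # delenv covers the opposite case: ensuring a variable is unset
            # for the duration of the block.
            m.delenv("SOME_UNRELATED_FLAG", raising=False)
            assert os.environ["VLLM_USE_MODELSCOPE"] == "True"
        # No explicit finally/delenv needed: both variables are restored to
        # their previous state when the block exits.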

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# ruff: noqa
import asyncio import asyncio
import os
import socket import socket
from collections.abc import AsyncIterator from collections.abc import AsyncIterator
from unittest.mock import patch from unittest.mock import patch
@ -112,8 +112,9 @@ def test_deprecate_kwargs_additional_message():
dummy(old_arg=1) dummy(old_arg=1)
def test_get_open_port(): def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
os.environ["VLLM_PORT"] = "5678" with monkeypatch.context() as m:
m.setenv("VLLM_PORT", "5678")
# make sure we can get multiple ports, even if the env var is set # make sure we can get multiple ports, even if the env var is set
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
s1.bind(("localhost", get_open_port())) s1.bind(("localhost", get_open_port()))
@ -121,7 +122,6 @@ def test_get_open_port():
s2.bind(("localhost", get_open_port())) s2.bind(("localhost", get_open_port()))
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
s3.bind(("localhost", get_open_port())) s3.bind(("localhost", get_open_port()))
os.environ.pop("VLLM_PORT")
# Tests for FlexibleArgumentParser # Tests for FlexibleArgumentParser
@ -366,9 +366,10 @@ def test_bind_kv_cache_non_attention():
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
def test_bind_kv_cache_encoder_decoder(monkeypatch): def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
# V1 TESTS: ENCODER_DECODER is not supported on V1 yet. # V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
monkeypatch.setenv("VLLM_USE_V1", "0") with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
from vllm.attention import Attention, AttentionType from vllm.attention import Attention, AttentionType

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os import pytest
from vllm.config import CompilationLevel from vllm.config import CompilationLevel
@ -9,10 +9,11 @@ from ..utils import compare_two_settings
# --enforce-eager on TPU causes graph compilation # --enforce-eager on TPU causes graph compilation
# this times out default Health Check in the MQLLMEngine, # this times out default Health Check in the MQLLMEngine,
# so we set the timeout here to 30s # so we set the timeout here to 30s
os.environ["VLLM_RPC_TIMEOUT"] = "30000"
def test_custom_dispatcher(): def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_RPC_TIMEOUT", "30000")
compare_two_settings( compare_two_settings(
"google/gemma-2b", "google/gemma-2b",
arg1=[ arg1=[
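Before this change, VLLM_RPC_TIMEOUT was written to os.environ at module import time, so the longer timeout leaked into the rest of the test session. Moving it into a monkeypatch context limits it to the one test that needs it. A schematic before/after:

    import os

    import pytest

    # Old pattern (removed): executed at import time, affecting every test
    # that runs after this module is collected.
    # os.environ["VLLM_RPC_TIMEOUT"] = "30000"


    def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
        # New pattern: the override exists only while this test runs.
        with monkeypatch.context() as m:
            m.setenv("VLLM_RPC_TIMEOUT", "30000")
            assert os.environ["VLLM_RPC_TIMEOUT"] == "30000"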

View File

@ -1,10 +1,12 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# ruff: noqa
# type: ignore
from __future__ import annotations
import os
import threading import threading
from collections.abc import Iterable from collections.abc import Iterable
from concurrent import futures from concurrent import futures
from typing import Callable, Literal from typing import Callable, Generator, Literal
import grpc import grpc
import pytest import pytest
@ -21,12 +23,14 @@ from vllm.tracing import SpanAttributes
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch): def use_v0_only(monkeypatch: pytest.MonkeyPatch):
""" """
Since this module is V0 only, set VLLM_USE_V1=0 for Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module. all tests in the module.
""" """
monkeypatch.setenv('VLLM_USE_V1', '0') with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
@ -67,7 +71,7 @@ class FakeTraceService(TraceServiceServicer):
@pytest.fixture @pytest.fixture
def trace_service(): def trace_service() -> Generator[FakeTraceService, None, None]:
"""Fixture to set up a fake gRPC trace service""" """Fixture to set up a fake gRPC trace service"""
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
service = FakeTraceService() service = FakeTraceService()
@ -80,12 +84,18 @@ def trace_service():
server.stop(None) server.stop(None)
def test_traces(trace_service): def test_traces(
os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" monkeypatch: pytest.MonkeyPatch,
trace_service: FakeTraceService,
):
with monkeypatch.context() as m:
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(
temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=256) max_tokens=256,
)
model = "facebook/opt-125m" model = "facebook/opt-125m"
llm = LLM( llm = LLM(
model=model, model=model,
@ -120,17 +130,19 @@ def test_traces(trace_service):
) == sampling_params.temperature ) == sampling_params.temperature
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
) == sampling_params.max_tokens
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n assert attributes.get(
assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
outputs[0].prompt_token_ids) outputs[0].prompt_token_ids)
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
metrics = outputs[0].metrics metrics = outputs[0].metrics
assert attributes.get( assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue ) == metrics.time_in_queue
ttft = metrics.first_token_time - metrics.arrival_time ttft = metrics.first_token_time - metrics.arrival_time
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
@ -145,12 +157,18 @@ def test_traces(trace_service):
assert metrics.model_execute_time is None assert metrics.model_execute_time is None
def test_traces_with_detailed_steps(trace_service): def test_traces_with_detailed_steps(
os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" monkeypatch: pytest.MonkeyPatch,
trace_service: FakeTraceService,
):
with monkeypatch.context() as m:
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(
temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=256) max_tokens=256,
)
model = "facebook/opt-125m" model = "facebook/opt-125m"
llm = LLM( llm = LLM(
model=model, model=model,
@ -186,17 +204,19 @@ def test_traces_with_detailed_steps(trace_service):
) == sampling_params.temperature ) == sampling_params.temperature
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
) == sampling_params.max_tokens
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n assert attributes.get(
assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
outputs[0].prompt_token_ids) outputs[0].prompt_token_ids)
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
metrics = outputs[0].metrics metrics = outputs[0].metrics
assert attributes.get( assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue ) == metrics.time_in_queue
ttft = metrics.first_token_time - metrics.arrival_time ttft = metrics.first_token_time - metrics.arrival_time
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
@ -207,9 +227,10 @@ def test_traces_with_detailed_steps(trace_service):
) == metrics.scheduler_time ) == metrics.scheduler_time
assert metrics.model_forward_time > 0 assert metrics.model_forward_time > 0
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD
metrics.model_forward_time / 1000) ) == pytest.approx(metrics.model_forward_time / 1000)
assert metrics.model_execute_time > 0 assert metrics.model_execute_time > 0
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE assert attributes.get(
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
) == metrics.model_execute_time ) == metrics.model_execute_time
assert metrics.model_forward_time < 1000 * metrics.model_execute_time assert metrics.model_forward_time < 1000 * metrics.model_execute_time
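Besides the OTEL_EXPORTER_OTLP_TRACES_INSECURE change, this file also annotates the trace_service fixture as a Generator, documenting that the fixture yields the service and then runs teardown. A tiny sketch of that annotation style, assuming nothing about the real FakeTraceService (FakeService is a stand-in):

    from typing import Generator

    import pytest


    class FakeService:
        """Stand-in for the fake gRPC trace service used in the real test."""


    @pytest.fixture
    def trace_service() -> Generator[FakeService, None, None]:
        service = FakeService()
        yield service
        # Teardown (stopping the gRPC server in the real fixture) runs here.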

View File

@ -566,6 +566,7 @@ def init_test_distributed_environment(
def multi_process_parallel( def multi_process_parallel(
monkeypatch: pytest.MonkeyPatch,
tp_size: int, tp_size: int,
pp_size: int, pp_size: int,
test_target: Any, test_target: Any,
@ -582,7 +583,13 @@ def multi_process_parallel(
refs = [] refs = []
for rank in range(tp_size * pp_size): for rank in range(tp_size * pp_size):
refs.append( refs.append(
test_target.remote(tp_size, pp_size, rank, distributed_init_port)) test_target.remote(
monkeypatch,
tp_size,
pp_size,
rank,
distributed_init_port,
), )
ray.get(refs) ray.get(refs)
ray.shutdown() ray.shutdown()

View File

@ -1,5 +1,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import random import random
from typing import Any
import pytest import pytest
@ -50,8 +53,12 @@ def model_name():
return "meta-llama/Meta-Llama-3-8B-Instruct" return "meta-llama/Meta-Llama-3-8B-Instruct"
def test_ngram_correctness(monkeypatch, test_prompts, sampling_config, def test_ngram_correctness(
model_name): monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
sampling_config: SamplingParams,
model_name: str,
):
''' '''
Compare the outputs of an original LLM and a speculative LLM Compare the outputs of an original LLM and a speculative LLM
should be the same when using ngram speculative decoding. should be the same when using ngram speculative decoding.

View File

@ -80,9 +80,11 @@ async def generate(engine: AsyncLLM,
[(TEXT_ENGINE_ARGS, TEXT_PROMPT), [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
(VISION_ENGINE_ARGS, VISION_PROMPT)]) (VISION_ENGINE_ARGS, VISION_PROMPT)])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load(monkeypatch, output_kind: RequestOutputKind, async def test_load(
engine_args_and_prompt: tuple[AsyncEngineArgs, monkeypatch: pytest.MonkeyPatch,
PromptType]): output_kind: RequestOutputKind,
engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType],
):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
# so that in the future when we switch, we don't have to change all the # so that in the future when we switch, we don't have to change all the
# tests. # tests.
@ -126,7 +128,8 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind,
[(TEXT_ENGINE_ARGS, TEXT_PROMPT), [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
(VISION_ENGINE_ARGS, VISION_PROMPT)]) (VISION_ENGINE_ARGS, VISION_PROMPT)])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_abort(monkeypatch, output_kind: RequestOutputKind, async def test_abort(monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
engine_args_and_prompt: tuple[AsyncEngineArgs, engine_args_and_prompt: tuple[AsyncEngineArgs,
PromptType]): PromptType]):

View File

@ -45,7 +45,7 @@ def make_request() -> EngineCoreRequest:
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_engine_core(monkeypatch): def test_engine_core(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
@ -159,7 +159,7 @@ def test_engine_core(monkeypatch):
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_engine_core_advanced_sampling(monkeypatch): def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
""" """
A basic end-to-end test to verify that the engine functions correctly A basic end-to-end test to verify that the engine functions correctly
when additional sampling parameters, such as top_p, min_tokens, and when additional sampling parameters, such as top_p, min_tokens, and
@ -209,7 +209,7 @@ def test_engine_core_advanced_sampling(monkeypatch):
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_engine_core_concurrent_batches(monkeypatch): def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
""" """
Test that the engine can handle multiple concurrent batches. Test that the engine can handle multiple concurrent batches.
""" """

View File

@ -90,7 +90,8 @@ def echo(self, msg: str, err_msg: Optional[str] = None) -> str:
@fork_new_process_for_each_test @fork_new_process_for_each_test
@pytest.mark.parametrize("multiprocessing_mode", [True, False]) @pytest.mark.parametrize("multiprocessing_mode", [True, False])
def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
@ -175,7 +176,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
@pytest.mark.asyncio(loop_scope="function") @pytest.mark.asyncio(loop_scope="function")
async def test_engine_core_client_asyncio(monkeypatch): async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")

View File

@ -255,12 +255,10 @@ def _run_and_validate(
[NONE, SAMPLE, PROMPT, SAMPLE_PROMPT]) [NONE, SAMPLE, PROMPT, SAMPLE_PROMPT])
@pytest.mark.parametrize("temperature", [0.0, 2.0]) @pytest.mark.parametrize("temperature", [0.0, 2.0])
def test_get_logprobs_and_prompt_logprobs( def test_get_logprobs_and_prompt_logprobs(
hf_model, hf_model, vllm_model,
vllm_model,
batch_logprobs_composition: BatchLogprobsComposition, batch_logprobs_composition: BatchLogprobsComposition,
temperature: float, temperature: float, example_prompts: list[str],
example_prompts, monkeypatch: pytest.MonkeyPatch) -> None:
) -> None:
"""Test V1 Engine logprobs & prompt logprobs """Test V1 Engine logprobs & prompt logprobs
Exercise a variety of combinations of `logprobs` and `prompt_logprobs` Exercise a variety of combinations of `logprobs` and `prompt_logprobs`
@ -287,6 +285,8 @@ def test_get_logprobs_and_prompt_logprobs(
temperature: "temperature" sampling parameter temperature: "temperature" sampling parameter
example_prompts: example prompt fixture example_prompts: example prompt fixture
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching
if do_apc and (temperature < 2.0 if do_apc and (temperature < 2.0
or batch_logprobs_composition != SAMPLE_PROMPT): or batch_logprobs_composition != SAMPLE_PROMPT):
@ -306,7 +306,8 @@ def test_get_logprobs_and_prompt_logprobs(
# Batch has mixed sample params # Batch has mixed sample params
# (different logprobs/prompt logprobs combos) # (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition) logprob_prompt_logprob_list = get_test_batch(
batch_logprobs_composition)
# Ensure that each test prompt has a logprob config for testing # Ensure that each test prompt has a logprob config for testing
logprob_prompt_logprob_list = _repeat_logprob_config( logprob_prompt_logprob_list = _repeat_logprob_config(
@ -333,16 +334,13 @@ def test_get_logprobs_and_prompt_logprobs(
do_apc=do_apc) do_apc=do_apc)
def test_max_logprobs(): def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs` """vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs` Should also fail for `prompt_logprobs > max_logprobs`
APC should not matter as this test checks basic request validation. APC should not matter as this test checks basic request validation.
Args:
monkeypatch
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
runner = VllmRunner("facebook/opt-125m", runner = VllmRunner("facebook/opt-125m",
max_logprobs=1, max_logprobs=1,
@ -354,40 +352,52 @@ def test_max_logprobs():
bad_sampling_params = SamplingParams(logprobs=2) bad_sampling_params = SamplingParams(logprobs=2)
with pytest.raises(ValueError): with pytest.raises(ValueError):
runner.generate(["Hello world"], sampling_params=bad_sampling_params) runner.generate(["Hello world"],
sampling_params=bad_sampling_params)
def test_none_logprobs(vllm_model, example_prompts): def test_none_logprobs(vllm_model, example_prompts,
monkeypatch: pytest.MonkeyPatch):
"""Engine should return `logprobs` and `prompt_logprobs` as `None` """Engine should return `logprobs` and `prompt_logprobs` as `None`
Args: Args:
vllm_model: vLLM model fixture vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5 max_tokens = 5
sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, sampling_params_logprobs_none = SamplingParams(
max_tokens=max_tokens,
logprobs=None, logprobs=None,
prompt_logprobs=None, prompt_logprobs=None,
temperature=0.0) temperature=0.0,
)
results_logprobs_none = vllm_model.model.generate( results_logprobs_none = vllm_model.model.generate(
example_prompts, sampling_params=sampling_params_logprobs_none) example_prompts,
sampling_params=sampling_params_logprobs_none,
)
for i in range(len(results_logprobs_none)): for i in range(len(results_logprobs_none)):
# Check sample logprobs are None # Check sample logprobs are None
assert results_logprobs_none[i].outputs[0].logprobs is None assert results_logprobs_none[i].outputs[0].logprobs is None
assert results_logprobs_none[i].outputs[0].cumulative_logprob is None assert results_logprobs_none[i].outputs[
0].cumulative_logprob is None
# Check prompt logprobs are None # Check prompt logprobs are None
assert results_logprobs_none[i].prompt_logprobs is None assert results_logprobs_none[i].prompt_logprobs is None
def test_zero_logprobs(vllm_model, example_prompts): def test_zero_logprobs(vllm_model, example_prompts,
monkeypatch: pytest.MonkeyPatch):
"""Engine should return sampled token and prompt token logprobs """Engine should return sampled token and prompt token logprobs
Args: Args:
vllm_model: vLLM model fixture vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5 max_tokens = 5
sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens, sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens,

View File

@ -3,11 +3,16 @@
Run `pytest tests/v1/tpu/test_basic.py`. Run `pytest tests/v1/tpu/test_basic.py`.
""" """
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest import pytest
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ...conftest import VllmRunner if TYPE_CHECKING:
from tests.conftest import VllmRunner
MODELS = [ MODELS = [
# "Qwen/Qwen2-7B-Instruct", # "Qwen/Qwen2-7B-Instruct",
@ -28,7 +33,8 @@ TENSOR_PARALLEL_SIZES = [1]
@pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES) @pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES)
def test_models( def test_models(
monkeypatch, vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
model: str, model: str,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
@ -41,7 +47,7 @@ def test_models(
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
with VllmRunner( with vllm_runner(
model, model,
max_model_len=8192, max_model_len=8192,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,