[Misc] Replace os.environ with monkeypatch in test suite (#14516)

Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
Sibi 2025-03-17 11:35:57 +08:00 committed by GitHub
parent 1e799b7ec1
commit a73e183e36
43 changed files with 1900 additions and 1658 deletions
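
The change this commit applies throughout the suite, as a minimal sketch (the test name below is illustrative, not from the diff; the environment variable is one that appears in the diffs that follow): pytest's monkeypatch scopes environment edits to the enclosing context and restores the previous value automatically, whereas writing to os.environ directly leaks state into later tests in the same process.

    import pytest

    # Before: mutates the process-wide environment and leaks into later tests.
    # os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"

    # After: the change is scoped to the context block and undone on exit.
    def test_backend_selection(monkeypatch: pytest.MonkeyPatch):
        with monkeypatch.context() as m:
            m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
            ...  # code under test reads the variable here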


@@ -522,7 +522,7 @@ steps:
     # TODO: investigate and fix
     # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
     - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-    - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+    - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
 
 - label: Plugin Tests (2 GPUs) # 40min
   working_dir: "/vllm-workspace/tests"


@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     model: str,
     backend: str,
@@ -63,7 +64,8 @@ def test_models(
         pytest.skip(
             f"{backend} does not support gemma2 with full context length.")
-    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", backend)
 
         # 5042 tokens for gemma2
         # gemma2 has alternating sliding window size of 4096
@@ -80,7 +82,8 @@ def test_models(
                          dtype=dtype,
                          enforce_eager=enforce_eager,
                          gpu_memory_utilization=0.7) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
 
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
@@ -104,6 +107,7 @@ def test_models(
     ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
 ])
 def test_models_distributed(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     vllm_runner,
     example_prompts,
@@ -116,13 +120,17 @@ def test_models_distributed(
     if test_suite != TARGET_TEST_SUITE:
         pytest.skip(f"Skip test for {test_suite}")
 
+    with monkeypatch.context() as monkeypatch_context:
         if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
             # test Ray Compiled Graph
-            os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-            os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+            monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
 
         if attention_backend:
-            os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
+            monkeypatch_context.setenv(
+                "VLLM_ATTENTION_BACKEND",
+                attention_backend,
+            )
 
         dtype = "half"
         max_tokens = 5
@@ -130,13 +138,16 @@ def test_models_distributed(
         # NOTE: take care of the order. run vLLM first, and then run HF.
         # vLLM needs a fresh new process without cuda initialization.
         # if we run HF first, the cuda initialization will be done and it
-        # will hurt multiprocessing backend with fork method (the default method).
-        with vllm_runner(model,
+        # will hurt multiprocessing backend with fork method
+        # (the default method).
+        with vllm_runner(
+            model,
             dtype=dtype,
             tensor_parallel_size=2,
-            distributed_executor_backend=distributed_executor_backend
+            distributed_executor_backend=distributed_executor_backend,
         ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
 
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)


@@ -7,16 +7,22 @@ prefill requests are chunked.
 Run `pytest tests/models/test_chunked_prefill.py`.
 """
-import os
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
 
 import pytest
 
-from tests.kernels.utils import override_backend_env_variable
 from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR
 
 from ..models.utils import check_logprobs_close, check_outputs_equal
 from ..utils import multi_gpu_test
 
+if TYPE_CHECKING:
+    from .conftest import HfRunner, VllmRunner
+
 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-3.2-1B-Instruct",
@@ -24,12 +30,14 @@ MODELS = [
 
 @pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
     """
     Since this module is V0 only, set VLLM_USE_V1=0 for
     all tests in the file.
     """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -42,8 +50,8 @@ def use_v0_only(monkeypatch):
 @pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
     example_prompts,
     model: str,
     dtype: str,
@@ -52,13 +60,14 @@ def test_models(
     enforce_eager: bool,
     tensor_parallel_size: int,
     attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """
     Checks exact match decode between huggingface model and vllm runner with
     chunked prefill.
     """
-    override_backend_env_variable(monkeypatch, attention_backend)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
 
         max_num_seqs = chunked_prefill_token_size
         max_num_batched_tokens = chunked_prefill_token_size
@@ -75,7 +84,8 @@ def test_models(
                 enforce_eager=enforce_eager,
                 max_num_seqs=max_num_seqs,
         ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
 
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
@@ -90,21 +100,21 @@ def test_models(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models_distributed(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
     example_prompts,
     model: str,
     distributed_executor_backend: str,
     attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    override_backend_env_variable(monkeypatch, attention_backend)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
 
         if (model == "meta-llama/Llama-3.2-1B-Instruct"
                 and distributed_executor_backend == "ray"):
             # test Ray Compiled Graph
-            os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-            os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+            m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
 
         dtype = "half"
         max_tokens = 5
@@ -119,7 +129,8 @@ def test_models_distributed(
         # NOTE: take care of the order. run vLLM first, and then run HF.
         # vLLM needs a fresh new process without cuda initialization.
         # if we run HF first, the cuda initialization will be done and it
-        # will hurt multiprocessing backend with fork method (the default method).
+        # will hurt multiprocessing backend with
+        # fork method (the default method).
 
         with vllm_runner(
             model,
@@ -130,7 +141,10 @@ def test_models_distributed(
             max_num_batched_tokens=max_num_batched_tokens,
             distributed_executor_backend=distributed_executor_backend,
         ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            vllm_outputs = vllm_model.generate_greedy(
+                example_prompts,
+                max_tokens,
+            )
 
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
@@ -158,7 +172,7 @@ def test_models_distributed(
 # the async postprocessor
 @pytest.mark.parametrize("disable_async_output_proc", [True])
 def test_models_with_fp8_kv_cache(
-    vllm_runner,
+    vllm_runner: VllmRunner,
     example_prompts,
     kv_cache_dtype: str,
     model: str,
@@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache(
 @pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("dtype", ["half"])
 def test_with_prefix_caching(
-    vllm_runner,
+    vllm_runner: VllmRunner,
     max_tokens: int,
     enforce_eager: bool,
     chunk_size: int,
@@ -254,8 +268,10 @@ def test_with_prefix_caching(
     ) as vllm_model:
         outputs[enable] = []
         for prompt in full_prompts:
-            outputs[enable] += vllm_model.generate_greedy([prompt],
-                                                          max_tokens)
+            outputs[enable] += vllm_model.generate_greedy(
+                [prompt],
+                max_tokens,
+            )
 
     check_outputs_equal(
         outputs_0_lst=outputs[False],
@@ -274,8 +290,8 @@ def test_with_prefix_caching(
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_models_cpu(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
     example_prompts,
     model: str,
     dtype: str,
@@ -283,7 +299,7 @@ def test_models_cpu(
     chunked_prefill_token_size: int,
     enforce_eager: bool,
     attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     test_models(
         hf_runner,
@@ -307,7 +323,7 @@ def test_models_cpu(
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_with_prefix_caching_cpu(
-    vllm_runner,
+    vllm_runner: VllmRunner,
     max_tokens: int,
     enforce_eager: bool,
     chunk_size: int,


@@ -123,9 +123,9 @@ def test_cumem_with_cudagraph():
     # sleep mode with pytorch checkpoint
     ("facebook/opt-125m", False),
 ])
-def test_end_to_end(model: str, use_v1: bool):
-    import os
-    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
+def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
         free, total = torch.cuda.mem_get_info()
         used_bytes_baseline = total - free  # in case other process is running
         llm = LLM(model, enable_sleep_mode=True)
@@ -158,5 +158,3 @@ def test_end_to_end(model: str, use_v1: bool):
 
     # cmp output
     assert output[0].outputs[0].text == output2[0].outputs[0].text
-
-    del os.environ["VLLM_USE_V1"]
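
With the context-manager form, the manual cleanup that used to end this test (the `del os.environ["VLLM_USE_V1"]` removed above) is no longer needed. A simplified sketch of the test above with its body trimmed, showing only the mechanism:

    import pytest

    def test_end_to_end(monkeypatch: pytest.MonkeyPatch, use_v1: bool):
        with monkeypatch.context() as m:
            m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
            ...  # exercise the LLM here
        # On exiting the block, VLLM_USE_V1 is restored to whatever it was
        # before the test, so no explicit `del os.environ[...]` is required.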


@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
 import dataclasses
-from typing import Optional
 
 import pytest
@@ -22,8 +22,11 @@ class TestSetting:
     fullgraph: bool
 
 
-# representative settings for testing
-test_settings = [
+# we cannot afford testing the full Catesian product
+# of all models and all levels
+@pytest.mark.parametrize(
+    "test_setting",
+    [
         # basic llama model
         TestSetting(
             model="meta-llama/Llama-3.2-1B-Instruct",
@@ -84,13 +87,11 @@ test_settings = [
             method="generate_with_image",
             fullgraph=False,
         ),
-]
-
-
-# we cannot afford testing the full Catesian product
-# of all models and all levels
-@pytest.mark.parametrize("test_setting", test_settings)
-def test_compile_correctness(test_setting: TestSetting):
+    ])
+def test_compile_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    test_setting: TestSetting,
+):
     # this test is run under multiple suits, with different GPUs.
     # make sure we only run the test with correct CUDA devices.
     # don't use "<", as it will duplicate the tests.
@@ -103,13 +104,17 @@ def test_compile_correctness(test_setting: TestSetting):
     fullgraph = test_setting.fullgraph
     if cuda_device_count_stateless() != pp_size * tp_size:
         pytest.skip("Not correct CUDA devices for the test.")
-    import os
-    os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
-    final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
-        ["-tp", str(tp_size)]
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+        final_args = [
+            "--enforce-eager", *model_args, "-pp",
+            str(pp_size), "-tp",
+            str(tp_size)
+        ]
 
         all_args: list[list[str]] = []
-        all_envs: list[Optional[dict[str, str]]] = []
+        all_envs: list[dict[str, str] | None] = []
 
         for level in [
                 CompilationLevel.NO_COMPILATION,


@@ -1,22 +1,115 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import pytest
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+import torch
 
+from tests.quantization.utils import is_quant_method_supported
+from vllm import LLM, SamplingParams
 from vllm.config import CompilationLevel
+from vllm.platforms import current_platform
 
 from ..utils import fork_new_process_for_each_test
-from .utils import TEST_MODELS, check_full_graph_support
 
 
-@pytest.mark.parametrize("model_info", TEST_MODELS)
+@pytest.fixture(params=None, name="model_info")
+def models_list_fixture(request):
+    TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
+        ("facebook/opt-125m", {}),
+        ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
+            "dtype": torch.float16,
+            "quantization": "compressed-tensors"
+        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
+            "dtype": torch.float16,
+            "quantization": "compressed-tensors"
+        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
+            "quantization": "compressed-tensors"
+        }),
+        ("meta-llama/Llama-3.2-1B-Instruct", {}),
+    ]
+
+    if is_quant_method_supported("aqlm"):
+        TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
+            "quantization": "aqlm"
+        }))
+
+    # TODO: figure out why this fails.
+    if False and is_quant_method_supported("gguf"):  # noqa: SIM223
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
+            "quantization": "gguf"
+        }))
+
+    if is_quant_method_supported("gptq"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
+            "quantization": "gptq"
+        }))
+
+    if is_quant_method_supported("gptq_marlin"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
+            "quantization": "gptq_marlin"
+        }))
+
+    if is_quant_method_supported("gptq_marlin_24"):
+        TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
+            "quantization": "gptq_marlin_24"
+        }))
+
+    if is_quant_method_supported("marlin"):
+        TEST_MODELS.append(
+            ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
+                "quantization": "marlin"
+            }))
+
+    if not current_platform.is_rocm() and is_quant_method_supported("awq"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
+            "quantization": "AWQ"
+        }))
+
+    return TEST_MODELS
+
+
 @pytest.mark.parametrize(
     "optimization_level",
-    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
+    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
+)
+@pytest.mark.parametrize("model_info", "", indirect=True)
 @fork_new_process_for_each_test
-def test_full_graph(model_info, optimization_level):
-    model = model_info[0]
-    model_kwargs = model_info[1]
-    check_full_graph_support(model,
-                             model_kwargs,
-                             optimization_level,
-                             tp_size=1)
+def test_full_graph(
+    monkeypatch: pytest.MonkeyPatch,
+    model_info: tuple[str, dict[str, Any]],
+    optimization_level: int,
+):
+    model, model_kwargs = model_info
+
+    with monkeypatch.context() as m:
+        # make sure these models can be captured in full graph mode
+        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
+        print(f"MODEL={model}")
+
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        sampling_params = SamplingParams(temperature=0)
+        llm = LLM(
+            model=model,
+            enforce_eager=True,
+            tensor_parallel_size=1,
+            disable_custom_all_reduce=True,
+            compilation_config=optimization_level,
+            **model_kwargs,
+        )
+        outputs = llm.generate(prompts, sampling_params)
+
+        # Print the outputs.
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


@@ -1,93 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-
-import torch
-
-from tests.quantization.utils import is_quant_method_supported
-from vllm import LLM, SamplingParams
-from vllm.platforms import current_platform
-
-TEST_MODELS = [
-    ("facebook/opt-125m", {}),
-    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
-        "dtype": torch.float16,
-        "quantization": "compressed-tensors"
-    }),
-    ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
-        "dtype": torch.float16,
-        "quantization": "compressed-tensors"
-    }),
-    ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
-        "quantization": "compressed-tensors"
-    }),
-    ("meta-llama/Llama-3.2-1B-Instruct", {}),
-]
-
-if is_quant_method_supported("aqlm"):
-    TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
-        "quantization": "aqlm"
-    }))
-
-# TODO: figure out why this fails.
-if False and is_quant_method_supported("gguf"):  # noqa: SIM223
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
-        "quantization": "gguf"
-    }))
-
-if is_quant_method_supported("gptq"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
-        "quantization": "gptq"
-    }))
-
-if is_quant_method_supported("gptq_marlin"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
-        "quantization": "gptq_marlin"
-    }))
-
-if is_quant_method_supported("gptq_marlin_24"):
-    TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
-        "quantization": "gptq_marlin_24"
-    }))
-
-if is_quant_method_supported("marlin"):
-    TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
-        "quantization": "marlin"
-    }))
-
-if not current_platform.is_rocm() and is_quant_method_supported("awq"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
-        "quantization": "AWQ"
-    }))
-
-
-def check_full_graph_support(model,
-                             model_kwargs,
-                             optimization_level,
-                             tp_size=1):
-    # make sure these models can be captured in full graph mode
-    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
-
-    print(f"MODEL={model}")
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model=model,
-              enforce_eager=True,
-              tensor_parallel_size=tp_size,
-              disable_custom_all_reduce=True,
-              compilation_config=optimization_level,
-              **model_kwargs)
-    outputs = llm.generate(prompts, sampling_params)
-
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


@@ -3,7 +3,10 @@
 Run `pytest tests/distributed/test_comm_ops.py`.
 """
-import os
+from __future__ import annotations
+
+from typing import Any, Callable
 
 import pytest
 import ray
@@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel
 
 @ray.remote(num_gpus=1, max_calls=1)
-def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
-                           distributed_init_port: str):
+def all_reduce_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
 
 @ray.remote(num_gpus=1, max_calls=1)
-def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
-                           distributed_init_port: str):
+def all_gather_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
 
 @ray.remote(num_gpus=1, max_calls=1)
-def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
-                                      distributed_init_port: str):
+def broadcast_tensor_dict_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 
 @ray.remote(num_gpus=1, max_calls=1)
-def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
-                                      distributed_init_port: str):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_tensor_dict_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 
 @ray.remote(num_gpus=1, max_calls=1)
-def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
-                          distributed_init_port: str):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
     all_reduce_test_worker, all_gather_test_worker,
     broadcast_tensor_dict_test_worker
 ])
-def test_multi_process_tensor_parallel(tp_size, test_target):
-    multi_process_parallel(tp_size, 1, test_target)
+def test_multi_process_tensor_parallel(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    test_target: Callable[..., Any],
+):
+    multi_process_parallel(monkeypatch, tp_size, 1, test_target)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
@@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
 @pytest.mark.parametrize("pp_size", [2])
 @pytest.mark.parametrize(
     "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
-def test_multi_process_pipeline_parallel(pp_size, test_target):
-    multi_process_parallel(1, pp_size, test_target)
+def test_multi_process_pipeline_parallel(
+    monkeypatch: pytest.MonkeyPatch,
+    pp_size: int,
+    test_target: Callable[..., Any],
+):
+    multi_process_parallel(monkeypatch, 1, pp_size, test_target)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
@@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target):
     broadcast_tensor_dict_test_worker
 ])
 def test_multi_process_tensor_parallel_pipeline_parallel(
-        tp_size, pp_size, test_target):
-    multi_process_parallel(tp_size, pp_size, test_target)
+    tp_size: int,
+    pp_size: int,
+    test_target: Callable[..., Any],
+    monkeypatch: pytest.MonkeyPatch,
+):
+    multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
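
In this file the test's MonkeyPatch instance is threaded through multi_process_parallel into the Ray worker functions, and delenv(..., raising=False) is the drop-in replacement for os.environ.pop(..., None). A minimal sketch of that replacement in isolation (the worker name here is illustrative, not from the diff):

    import pytest

    def example_worker(monkeypatch: pytest.MonkeyPatch, rank: int):
        # Same effect as os.environ.pop("CUDA_VISIBLE_DEVICES", None):
        # raising=False makes the call a no-op when the variable is unset,
        # and the deletion is reverted when the patching is undone.
        monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)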


@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import os
 import random
 
 import pytest
@@ -23,8 +22,15 @@ for i, v in enumerate(test_sizes):
 
 @ray.remote(num_gpus=1, max_calls=1)
-def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def graph_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pp_size,
+    rank,
+    distributed_init_port,
+):
+    with monkeypatch.context() as m:
+        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
         device = torch.device(f"cuda:{rank}")
         torch.cuda.set_device(device)
         init_test_distributed_environment(tp_size, pp_size, rank,
@@ -79,8 +85,15 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
 
 @ray.remote(num_gpus=1, max_calls=1)
-def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def eager_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pp_size,
+    rank,
+    distributed_init_port,
+):
+    with monkeypatch.context() as m:
+        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
         device = torch.device(f"cuda:{rank}")
         torch.cuda.set_device(device)
         init_test_distributed_environment(tp_size, pp_size, rank,
@@ -110,8 +123,14 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
 @pytest.mark.parametrize("tp_size", [2])
 @pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
 @pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
-def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
+def test_custom_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pipeline_parallel_size,
+    test_target,
+):
     world_size = tp_size * pipeline_parallel_size
     if world_size > torch.cuda.device_count():
         pytest.skip("Not enough GPUs to run the test.")
-    multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
+    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
+                           test_target)


@@ -7,15 +7,17 @@ import pytest
 from vllm.distributed.utils import get_pp_indices
 
 
-def test_custom_layer_partition():
+def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
 
         def _verify(partition_str, num_layers, pp_size, goldens):
             bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
-            os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
+            m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
             for pp_rank, golden in enumerate(goldens):
                 assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
             if bak is not None:
-                os.environ["VLLM_PP_LAYER_PARTITION"] = bak
+                m.setenv("VLLM_PP_LAYER_PARTITION", bak)
 
         # Even partition
         _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
@@ -55,6 +57,10 @@ def test_custom_layer_partition():
     (5, 3, 1, (2, 4)),
     (5, 3, 2, (4, 5)),
 ])
-def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int,
-                               pp_rank: int, indices: tuple[int, int]):
+def test_uneven_auto_partition(
+    num_hidden_layers: int,
+    pp_size: int,
+    pp_rank: int,
+    indices: tuple[int, int],
+):
     assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
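
One note on `_verify` above: the diff keeps its manual save/restore of `VLLM_PP_LAYER_PARTITION` via `bak`. With monkeypatch that bookkeeping is optional, since `m.setenv` records the prior value and the context manager restores it on exit. A hedged sketch of the leaner form, shown only to illustrate the mechanism (this is not what the commit does):

    import pytest
    from vllm.distributed.utils import get_pp_indices

    def _verify(m: pytest.MonkeyPatch, partition_str, num_layers, pp_size, goldens):
        # m.setenv remembers the original value; it is restored automatically
        # when the monkeypatch context that owns `m` exits.
        m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
        for pp_rank, golden in enumerate(goldens):
            assert get_pp_indices(num_layers, pp_rank, pp_size) == golden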


@@ -1,11 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import os
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
 
 import pytest
 
 from ..utils import compare_two_settings, fork_new_process_for_each_test
 
+if TYPE_CHECKING:
+    from typing_extensions import LiteralString
+
 
 @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
     (2, "JackFram/llama-160m"),
@@ -15,7 +19,13 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test
     "FLASHINFER",
 ])
 @fork_new_process_for_each_test
-def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
+def test_pp_cudagraph(
+    monkeypatch: pytest.MonkeyPatch,
+    PP_SIZE: int,
+    MODEL_NAME: str,
+    ATTN_BACKEND: LiteralString,
+):
+    with monkeypatch.context() as m:
         cudagraph_args = [
             # use half precision for speed and memory savings in CI environment
             "--dtype",
@@ -25,7 +35,7 @@ def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
             "--distributed-executor-backend",
             "mp",
         ]
-    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
+        m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
 
         eager_args = cudagraph_args + ["--enforce-eager"]


@@ -49,7 +49,7 @@ TPU_TP_TEST_STR = ""  #"tensor_parallel_size=4"
 @pytest.mark.skipif(not current_platform.is_cuda()
                     and not current_platform.is_tpu(),
                     reason="V1 is currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
     """Run with the V1 Engine."""
 
     with monkeypatch.context() as m:
@@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
     run_test(more_args)
 
 
-def test_lm_eval_accuracy_v0_engine(monkeypatch):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
     """Run with the V0 Engine."""
 
     with monkeypatch.context() as m:


@@ -53,21 +53,29 @@ def cache_models():
 @pytest.mark.skip_global_cleanup
 @pytest.mark.usefixtures("cache_models")
-def test_offline_mode(monkeypatch):
+def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
     # Set HF to offline mode and ensure we can still construct an LLM
+    with monkeypatch.context() as m:
         try:
-            monkeypatch.setenv("HF_HUB_OFFLINE", "1")
-            monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1")
+            m.setenv("HF_HUB_OFFLINE", "1")
+            m.setenv("VLLM_NO_USAGE_STATS", "1")
 
             def disable_connect(*args, **kwargs):
                 raise RuntimeError("No http calls allowed")
 
-            monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect",
-                                disable_connect)
-            monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect",
-                                disable_connect)
+            m.setattr(
+                urllib3.connection.HTTPConnection,
+                "connect",
+                disable_connect,
+            )
+            m.setattr(
+                urllib3.connection.HTTPSConnection,
+                "connect",
+                disable_connect,
+            )
 
-            # Need to re-import huggingface_hub and friends to setup offline mode
+            # Need to re-import huggingface_hub
+            # and friends to setup offline mode
             _re_import_modules()
             # Cached model files should be used in offline mode
             for model_config in MODEL_CONFIGS:
@@ -75,10 +83,7 @@ def test_offline_mode(monkeypatch):
         finally:
             # Reset the environment after the test
            # NB: Assuming tests are run in online mode
-            monkeypatch.delenv("HF_HUB_OFFLINE")
-            monkeypatch.delenv("VLLM_NO_USAGE_STATS")
             _re_import_modules()
-            pass
 
 
 def _re_import_modules():
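
The same MonkeyPatch object handles more than environment variables: in the hunk above it also swaps out `connect` on urllib3's connection classes so any accidental network call fails fast, and the originals come back when the context exits. A minimal sketch of that piece in isolation (the test name is illustrative, not from the diff):

    import pytest
    import urllib3

    def _disable_connect(*args, **kwargs):
        raise RuntimeError("No http calls allowed")

    def test_stays_offline(monkeypatch: pytest.MonkeyPatch):
        with monkeypatch.context() as m:
            m.setenv("HF_HUB_OFFLINE", "1")
            # Any HTTP(S) connection attempt now raises; the original
            # connect methods are restored when the block exits.
            m.setattr(urllib3.connection.HTTPConnection, "connect", _disable_connect)
            m.setattr(urllib3.connection.HTTPSConnection, "connect", _disable_connect)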


@@ -70,7 +70,7 @@ def run_test(more_args):
 @pytest.mark.skipif(not current_platform.is_cuda()
                     and not current_platform.is_tpu(),
                     reason="V1 currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
     """Run with the V1 Engine."""
 
     with monkeypatch.context() as m:
@@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
 
 @pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
-def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
+                                    more_args):
     """Run with the V0 Engine."""
 
     with monkeypatch.context() as m:


@@ -5,13 +5,12 @@ from unittest.mock import Mock, patch
 import pytest
 import torch
 
-from tests.kernels.utils import override_backend_env_variable
 from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
 from vllm.platforms.openvino import OpenVinoPlatform
 from vllm.platforms.rocm import RocmPlatform
-from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
+from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
 
 
 @pytest.fixture(autouse=True)
@@ -25,54 +24,67 @@ def clear_cache():
     "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
 @pytest.mark.parametrize("use_v1", [True, False])
 @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
-def test_env(name: str, use_v1: bool, device: str, monkeypatch):
+def test_env(
+    name: str,
+    use_v1: bool,
+    device: str,
+    monkeypatch: pytest.MonkeyPatch,
+):
     """Test that the attention selector can be set via environment variable.
     Note that we do not test FlashAttn because it is the default backend.
     """
-    monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
-    override_backend_env_variable(monkeypatch, name)
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv(STR_BACKEND_ENV_VAR, name)
 
         if device == "cpu":
-            with patch("vllm.attention.selector.current_platform", CpuPlatform()):
-                backend = get_attn_backend(16, torch.float16, torch.float16, 16,
-                                           False)
+            with patch("vllm.attention.selector.current_platform",
+                       CpuPlatform()):
+                backend = get_attn_backend(16, torch.float16, torch.float16,
+                                           16, False)
             assert backend.get_name() == "TORCH_SDPA"
         elif device == "hip":
-            with patch("vllm.attention.selector.current_platform", RocmPlatform()):
-                backend = get_attn_backend(16, torch.float16, torch.float16, 16,
-                                           False)
+            with patch("vllm.attention.selector.current_platform",
+                       RocmPlatform()):
+                backend = get_attn_backend(16, torch.float16, torch.float16,
+                                           16, False)
             EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
             assert backend.get_name() == EXPECTED
         elif device == "openvino":
             with patch("vllm.attention.selector.current_platform",
                        OpenVinoPlatform()), patch.dict('sys.modules',
                                                        {'openvino': Mock()}):
-                backend = get_attn_backend(16, torch.float16, torch.float16, 16,
-                                           False)
+                backend = get_attn_backend(16, torch.float16, torch.float16,
                                            16, False)
             assert backend.get_name() == "OPENVINO"
         else:
             if name in ["XFORMERS", "FLASHINFER"]:
                 with patch("vllm.attention.selector.current_platform",
                            CudaPlatform()):
-                    backend = get_attn_backend(16, torch.float16, torch.float16,
-                                               16, False)
+                    backend = get_attn_backend(16, torch.float16,
                                                torch.float16, 16, False)
                 EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
                 assert backend.get_name() == EXPECTED
 
 
-def test_flash_attn(monkeypatch):
+def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     """Test FlashAttn validation."""
     # TODO: When testing for v1, pipe in `use_v1` as an argument to
     # get_attn_backend
 
-    override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
 
         # Unsupported CUDA arch
-        with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
+        monkeypatch.setattr(torch.cuda, "get_device_capability", lambda:
+                            (7, 5))
         backend = get_attn_backend(16, torch.float16, None, 16, False)
         assert backend.get_name() != STR_FLASH_ATTN_VAL
 
+        # Reset the monkeypatch for subsequent tests
+        monkeypatch.undo()
+
         # Unsupported data type
         backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
         assert backend.get_name() != STR_FLASH_ATTN_VAL
@@ -86,10 +98,19 @@ def test_flash_attn(monkeypatch):
         assert backend.get_name() != STR_FLASH_ATTN_VAL
 
         # flash-attn is not installed
-        with patch.dict('sys.modules', {'vllm_flash_attn': None}):
+        import sys
+        original_module = sys.modules.get('vllm_flash_attn')
+        monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None)
         backend = get_attn_backend(16, torch.float16, None, 16, False)
         assert backend.get_name() != STR_FLASH_ATTN_VAL
 
+        # Restore the original module if it existed
+        if original_module is not None:
+            monkeypatch.setitem(sys.modules, 'vllm_flash_attn',
+                                original_module)
+        else:
+            monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False)
+
         # Unsupported head size
         backend = get_attn_backend(17, torch.float16, None, 16, False)
         assert backend.get_name() != STR_FLASH_ATTN_VAL
@@ -100,12 +121,14 @@ def test_flash_attn(monkeypatch):
 
 @pytest.mark.parametrize("use_v1", [True, False])
-def test_invalid_env(use_v1: bool, monkeypatch):
-    """Ignore the invalid env variable if it is set."""
-    monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
-    override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
+def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
 
-    with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+    with monkeypatch.context() as m, patch(
+            "vllm.attention.selector.current_platform", CudaPlatform()):
+        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
+
+        # Test with head size 32
         backend = get_attn_backend(32, torch.float16, None, 16, False)
         EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"
         assert backend.get_name() == EXPECTED
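
Two MonkeyPatch features used above are worth isolating: setitem/delitem patch a dictionary entry (here sys.modules, to simulate a missing optional dependency), and undo() rolls back everything recorded so far partway through a test. A minimal sketch (the test name is illustrative, not from the diff):

    import sys
    import pytest

    def test_without_flash_attn(monkeypatch: pytest.MonkeyPatch):
        # Pretend vllm_flash_attn is not importable for this test only.
        monkeypatch.setitem(sys.modules, "vllm_flash_attn", None)
        ...  # exercise the fallback path
        # Revert every patch recorded on this MonkeyPatch so far; anything
        # applied afterwards is still cleaned up at normal test teardown.
        monkeypatch.undo()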


@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import os
-
 import pytest
 import torch
@@ -11,8 +9,9 @@ from vllm import _custom_ops as ops  # noqa: F401
 
 @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
                     reason="AWQ is not supported on this GPU type.")
-def test_awq_dequantize_opcheck():
-    os.environ["VLLM_USE_TRITON_AWQ"] = "0"
+def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_TRITON_AWQ", "0")
         qweight = torch.randint(-2000000000,
                                 2000000000, (8192, 256),
                                 device='cuda',
@@ -29,8 +28,9 @@ def test_awq_dequantize_opcheck():
 @pytest.mark.skip(reason="Not working; needs investigation.")
 @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
                     reason="AWQ is not supported on this GPU type.")
-def test_awq_gemm_opcheck():
-    os.environ["VLLM_USE_TRITON_AWQ"] = "0"
+def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_TRITON_AWQ", "0")
         input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
         qweight = torch.randint(-2000000000,
                                 2000000000, (8192, 256),


@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from unittest.mock import patch
-
 import pytest
 import torch
 
-from tests.kernels.utils import override_backend_env_variable
 from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
 from vllm.platforms.rocm import RocmPlatform
+from vllm.utils import STR_BACKEND_ENV_VAR
 
 
 @pytest.fixture(autouse=True)
@@ -17,15 +15,19 @@ def clear_cache():
     _cached_get_attn_backend.cache_clear()
 
 
-def test_selector(monkeypatch):
-    """Test that the attention selector for ROCm.
-    """
-    override_backend_env_variable(monkeypatch, "ROCM_FLASH")
+def test_selector(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")
 
-    with patch("vllm.attention.selector.current_platform", RocmPlatform()):
+        # Set the current platform to ROCm using monkeypatch
+        monkeypatch.setattr("vllm.attention.selector.current_platform",
+                            RocmPlatform())
+
+        # Test standard ROCm attention
         backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
         assert (backend.get_name() == "ROCM_FLASH"
                 or backend.get_name() == "ROCM_ATTN_VLLM_V1")
 
         # mla test for deepseek related
         backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
                                    False, True)


@@ -12,11 +12,10 @@ import pytest
 from tests.kernels.utils import override_backend_env_variable
 from tests.quantization.utils import is_quant_method_supported
 from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR
 
 from ...utils import check_logprobs_close
 
-os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
 
 @pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
@@ -55,13 +54,15 @@ def test_models(
     backend: str,
     tensor_parallel_size: int,
     disable_async_output_proc: bool,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """
     Only checks log probs match to cover the discrepancy in
     numerical sensitive kernels.
     """
-    override_backend_env_variable(monkeypatch, backend)
+    with monkeypatch.context() as m:
+        m.setenv("TOKENIZERS_PARALLELISM", 'true')
+        m.setenv(STR_BACKEND_ENV_VAR, backend)
 
         MAX_MODEL_LEN = 1024
         NUM_LOG_PROBS = 8
@@ -119,11 +120,14 @@ def test_cpu_models(
     test_model: str,
     max_tokens: int,
     disable_async_output_proc: bool,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """
     Only checks log probs match to cover the discrepancy in
     numerical sensitive kernels.
     """
+    with monkeypatch.context() as m:
+        m.setenv("TOKENIZERS_PARALLELISM", 'true')
 
         MAX_MODEL_LEN = 1024
         NUM_LOG_PROBS = 8


@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import importlib.util import importlib.util
import math import math
@ -11,6 +12,7 @@ from scipy.spatial.distance import cosine
import vllm import vllm
import vllm.config import vllm.config
from vllm.utils import STR_BACKEND_ENV_VAR
from ....utils import RemoteOpenAIServer from ....utils import RemoteOpenAIServer
@ -29,9 +31,10 @@ def _arr(arr):
return array("i", arr) return array("i", arr)
def test_find_array(monkeypatch): def test_find_array(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
from vllm.model_executor.models.gritlm import GritLMPooler from vllm.model_executor.models.gritlm import GritLMPooler
@ -53,9 +56,6 @@ def test_find_array(monkeypatch):
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server_embedding(): def server_embedding():
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
with pytest.MonkeyPatch.context() as mp:
mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
@ -69,7 +69,10 @@ def server_generate():
@pytest_asyncio.fixture @pytest_asyncio.fixture
async def client_embedding(server_embedding: RemoteOpenAIServer): async def client_embedding(monkeypatch: pytest.MonkeyPatch,
server_embedding: RemoteOpenAIServer):
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
async with server_embedding.get_async_client() as async_client: async with server_embedding.get_async_client() as async_client:
yield async_client yield async_client
@ -80,14 +83,20 @@ async def client_generate(server_generate: RemoteOpenAIServer):
yield async_client yield async_client
def run_llm_encode(llm: vllm.LLM, queries: list[str], def run_llm_encode(
instruction: str) -> list[float]: llm: vllm.LLM,
queries: list[str],
instruction: str,
) -> list[float]:
outputs = llm.encode([instruction + q for q in queries], ) outputs = llm.encode([instruction + q for q in queries], )
return [output.outputs.embedding for output in outputs] return [output.outputs.embedding for output in outputs]
async def run_client_embeddings(client: vllm.LLM, queries: list[str], async def run_client_embeddings(
instruction: str) -> list[float]: client: vllm.LLM,
queries: list[str],
instruction: str,
) -> list[float]:
outputs = await client.embeddings.create( outputs = await client.embeddings.create(
model=MODEL_NAME, model=MODEL_NAME,
input=[instruction + q for q in queries], input=[instruction + q for q in queries],
@ -106,7 +115,7 @@ def get_test_data():
README.md in https://github.com/ContextualAI/gritlm README.md in https://github.com/ContextualAI/gritlm
""" """
q_instruction = gritlm_instruction( q_instruction = gritlm_instruction(
"Given a scientific paper title, retrieve the paper's abstract") "Given a scientific paper title, retrieve the paper's abstract", )
queries = [ queries = [
"Bitcoin: A Peer-to-Peer Electronic Cash System", "Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning", "Generative Representational Instruction Tuning",
@ -136,9 +145,10 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]):
assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001)
def test_gritlm_offline_embedding(monkeypatch): def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
queries, q_instruction, documents, d_instruction = get_test_data() queries, q_instruction, documents, d_instruction = get_test_data()
@ -160,7 +170,7 @@ def test_gritlm_offline_embedding(monkeypatch):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_gritlm_api_server_embedding( async def test_gritlm_api_server_embedding(
client_embedding: openai.AsyncOpenAI): client_embedding: openai.AsyncOpenAI, ):
queries, q_instruction, documents, d_instruction = get_test_data() queries, q_instruction, documents, d_instruction = get_test_data()
d_rep = await run_client_embeddings( d_rep = await run_client_embeddings(
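The two hunks above capture the recurring refactor in this commit: the XFORMERS override is no longer applied with a module-level pytest.MonkeyPatch.context() inside the server fixture, but through the function-scoped monkeypatch fixture in the client fixture and the offline test. Both forms are part of pytest's public MonkeyPatch API; a minimal, self-contained sketch of when each one applies (DUMMY_BACKEND_VAR is a made-up variable name for illustration):

    import os

    import pytest


    @pytest.fixture(scope="module")
    def patched_module_env():
        # Non-function-scoped fixtures cannot request the `monkeypatch`
        # fixture, so the classmethod form is used there instead.
        with pytest.MonkeyPatch.context() as mp:
            mp.setenv("DUMMY_BACKEND_VAR", "XFORMERS")
            yield


    def test_function_scoped_override(monkeypatch: pytest.MonkeyPatch):
        # Inside a test, the fixture plus .context() scopes the override to
        # the with-block and undoes it even if the body raises.
        with monkeypatch.context() as m:
            m.setenv("DUMMY_BACKEND_VAR", "XFORMERS")
            assert os.environ["DUMMY_BACKEND_VAR"] == "XFORMERS"
        assert "DUMMY_BACKEND_VAR" not in os.environ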

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
@ -11,20 +9,28 @@ from ..utils import fork_new_process_for_each_test
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_plugin(dummy_opt_path, monkeypatch): def test_plugin(
monkeypatch: pytest.MonkeyPatch,
dummy_opt_path: str,
):
# V1 shuts down rather than raising an error here. # V1 shuts down rather than raising an error here.
monkeypatch.setenv("VLLM_USE_V1", "0") with monkeypatch.context() as m:
os.environ["VLLM_PLUGINS"] = "" m.setenv("VLLM_USE_V1", "0")
m.setenv("VLLM_PLUGINS", "")
with pytest.raises(Exception) as excinfo: with pytest.raises(Exception) as excinfo:
LLM(model=dummy_opt_path, load_format="dummy") LLM(model=dummy_opt_path, load_format="dummy")
error_msg = "has no vLLM implementation and " \ error_msg = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM" # noqa: E501
"the Transformers implementation is not compatible with vLLM"
assert (error_msg in str(excinfo.value)) assert (error_msg in str(excinfo.value))
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_oot_registration_text_generation(dummy_opt_path): def test_oot_registration_text_generation(
os.environ["VLLM_PLUGINS"] = "register_dummy_model" monkeypatch: pytest.MonkeyPatch,
dummy_opt_path: str,
):
with monkeypatch.context() as m:
m.setenv("VLLM_PLUGINS", "register_dummy_model")
prompts = ["Hello, my name is", "The text does not matter"] prompts = ["Hello, my name is", "The text does not matter"]
sampling_params = SamplingParams(temperature=0) sampling_params = SamplingParams(temperature=0)
llm = LLM(model=dummy_opt_path, load_format="dummy") llm = LLM(model=dummy_opt_path, load_format="dummy")
@ -39,8 +45,12 @@ def test_oot_registration_text_generation(dummy_opt_path):
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_oot_registration_embedding(dummy_gemma2_embedding_path): def test_oot_registration_embedding(
os.environ["VLLM_PLUGINS"] = "register_dummy_model" monkeypatch: pytest.MonkeyPatch,
dummy_gemma2_embedding_path: str,
):
with monkeypatch.context() as m:
m.setenv("VLLM_PLUGINS", "register_dummy_model")
prompts = ["Hello, my name is", "The text does not matter"] prompts = ["Hello, my name is", "The text does not matter"]
llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
outputs = llm.embed(prompts) outputs = llm.embed(prompts)
@ -53,8 +63,12 @@ image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_oot_registration_multimodal(dummy_llava_path, monkeypatch): def test_oot_registration_multimodal(
os.environ["VLLM_PLUGINS"] = "register_dummy_model" monkeypatch: pytest.MonkeyPatch,
dummy_llava_path: str,
):
with monkeypatch.context() as m:
m.setenv("VLLM_PLUGINS", "register_dummy_model")
prompts = [{ prompts = [{
"prompt": "What's in the image?<image>", "prompt": "What's in the image?<image>",
"multi_modal_data": { "multi_modal_data": {

View File

@ -235,9 +235,11 @@ async def test_bad_request(tmp_socket):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_mp_crash_detection(monkeypatch): async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.") parser = FlexibleArgumentParser(
description="vLLM's remote OpenAI server.")
parser = make_arg_parser(parser) parser = make_arg_parser(parser)
args = parser.parse_args([]) args = parser.parse_args([])
@ -245,14 +247,15 @@ async def test_mp_crash_detection(monkeypatch):
def mock_init(): def mock_init():
raise ValueError raise ValueError
monkeypatch.setattr(LLMEngine, "__init__", mock_init) m.setattr(LLMEngine, "__init__", mock_init)
start = time.perf_counter() start = time.perf_counter()
async with build_async_engine_client(args): async with build_async_engine_client(args):
pass pass
end = time.perf_counter() end = time.perf_counter()
assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s " assert end - start < 60, (
"Expected vLLM to gracefully shutdown in <60s "
"if there is an error in the startup.") "if there is an error in the startup.")

View File

@ -5,7 +5,7 @@ from typing import Optional
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close from ..models.utils import check_logprobs_close
from ..utils import (completions_with_server_args, get_client_text_generations, from ..utils import (completions_with_server_args, get_client_text_generations,
@ -52,7 +52,7 @@ async def test_multi_step(
num_logprobs: Optional[int], num_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
enable_chunked_prefill: bool, enable_chunked_prefill: bool,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol """Test vLLM engine with multi-step scheduling in an OpenAI-protocol
client/server environment. client/server environment.
@ -82,7 +82,8 @@ async def test_multi_step(
pytest.skip("Multi-step with Chunked-Prefill only supports" pytest.skip("Multi-step with Chunked-Prefill only supports"
"PP=1 and FLASH_ATTN backend") "PP=1 and FLASH_ATTN backend")
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts prompts = example_prompts
if len(prompts) < num_prompts: if len(prompts) < num_prompts:
@ -135,8 +136,10 @@ async def test_multi_step(
# Assert multi-step scheduling produces nearly-identical logprobs # Assert multi-step scheduling produces nearly-identical logprobs
# to single-step scheduling. # to single-step scheduling.
ref_text_logprobs = get_client_text_logprob_generations(ref_completions) ref_text_logprobs = get_client_text_logprob_generations(
test_text_logprobs = get_client_text_logprob_generations(test_completions) ref_completions)
test_text_logprobs = get_client_text_logprob_generations(
test_completions)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=ref_text_logprobs, outputs_0_lst=ref_text_logprobs,
outputs_1_lst=test_text_logprobs, outputs_1_lst=test_text_logprobs,
@ -152,7 +155,7 @@ async def test_multi_step(
async def test_multi_step_pp_smoke( async def test_multi_step_pp_smoke(
tp_size: int, tp_size: int,
pp_size: int, pp_size: int,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Smoke test for the vLLM engine with multi-step scheduling in an Smoke test for the vLLM engine with multi-step scheduling in an
@ -174,7 +177,8 @@ async def test_multi_step_pp_smoke(
attention_backend = "FLASH_ATTN" attention_backend = "FLASH_ATTN"
max_num_seqs = 3 max_num_seqs = 3
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
# Prompt from the ShareGPT dataset # Prompt from the ShareGPT dataset
prompts = [ prompts = [
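In both multi-step files, the override_backend_env_variable(monkeypatch, ...) helper from tests/kernels/utils is replaced by setting STR_BACKEND_ENV_VAR (vLLM's name for the VLLM_ATTENTION_BACKEND override) directly inside a monkeypatch context; most of the remaining churn in these hunks is the test bodies moving one indentation level deeper into the new with-block. A minimal sketch of the resulting shape, with the constant hard-coded so the example has no vLLM dependency:

    import os

    import pytest

    # Hard-coded stand-in for vllm.utils.STR_BACKEND_ENV_VAR.
    STR_BACKEND_ENV_VAR = "VLLM_ATTENTION_BACKEND"


    @pytest.mark.parametrize("attention_backend", ["FLASH_ATTN", "FLASHINFER"])
    def test_backend_override(
        attention_backend: str,
        monkeypatch: pytest.MonkeyPatch,
    ) -> None:
        with monkeypatch.context() as m:
            m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
            # ... engine construction and assertions would go here; the
            # override disappears when the block exits, keeping the
            # parametrized cases isolated from each other.
            assert os.environ[STR_BACKEND_ENV_VAR] == attention_backend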

View File

@ -7,7 +7,7 @@ from typing import Optional
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal from ..models.utils import check_logprobs_close, check_outputs_equal
@ -42,7 +42,7 @@ def test_multi_step_llm(
num_prompts: int, num_prompts: int,
num_logprobs: Optional[int], num_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test vLLM engine with multi-step scheduling via sync LLM Engine. """Test vLLM engine with multi-step scheduling via sync LLM Engine.
@ -70,7 +70,8 @@ def test_multi_step_llm(
num_logprobs: corresponds to the `logprobs` argument to the OpenAI num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> 1 logprob returned. completions endpoint; `None` -> 1 logprob returned.
""" """
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts prompts = example_prompts
if len(prompts) < num_prompts: if len(prompts) < num_prompts:
@ -136,7 +137,7 @@ def test_multi_step_llm_w_prompt_logprobs(
num_logprobs: Optional[int], num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int], num_prompt_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test prompt logprobs with multi-step scheduling via sync LLM Engine. """Test prompt logprobs with multi-step scheduling via sync LLM Engine.
@ -166,7 +167,8 @@ def test_multi_step_llm_w_prompt_logprobs(
note that this argument is not supported by the note that this argument is not supported by the
OpenAI completions endpoint. OpenAI completions endpoint.
""" """
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts prompts = example_prompts
if len(prompts) < num_prompts: if len(prompts) < num_prompts:
@ -230,7 +232,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
num_prompts: int, num_prompts: int,
num_logprobs: Optional[int], num_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test vLLM engine with multi-step+"single-step chunked prefill"+APC. """Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
@ -293,13 +295,14 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
# #
# The incorrect scheduling behavior - if it occurs - will cause an exception # The incorrect scheduling behavior - if it occurs - will cause an exception
# in the model runner resulting from `do_sample=False`. # in the model runner resulting from `do_sample=False`.
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
assert len(example_prompts) >= 2 assert len(example_prompts) >= 2
challenge_prompts = copy.deepcopy(example_prompts) challenge_prompts = copy.deepcopy(example_prompts)
challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient ' challenge_prompts[0] = (
'inference and serving engine for LLMs.\n' 'vLLM is a high-throughput and memory-efficient '
) # 24 tok 'inference and serving engine for LLMs.\n') # 24 tok
challenge_prompts[1] = ( challenge_prompts[1] = (
'Briefly describe the major milestones in the ' 'Briefly describe the major milestones in the '
'development of artificial intelligence from 1950 to 2020.\n' 'development of artificial intelligence from 1950 to 2020.\n'
@ -326,9 +329,9 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
max_num_seqs=4, max_num_seqs=4,
block_size=16, block_size=16,
) as vllm_model: ) as vllm_model:
outputs_baseline = (vllm_model.generate_greedy( outputs_baseline = (
challenge_prompts, max_tokens) if num_logprobs is None else vllm_model.generate_greedy(challenge_prompts, max_tokens) if
vllm_model.generate_greedy_logprobs( num_logprobs is None else vllm_model.generate_greedy_logprobs(
challenge_prompts, max_tokens, num_logprobs)) challenge_prompts, max_tokens, num_logprobs))
# multi-step+"single-step chunked prefill"+APC # multi-step+"single-step chunked prefill"+APC
@ -346,9 +349,9 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
max_num_seqs=4, max_num_seqs=4,
block_size=16, block_size=16,
) as vllm_model: ) as vllm_model:
outputs_w_features = (vllm_model.generate_greedy( outputs_w_features = (
challenge_prompts, max_tokens) if num_logprobs is None else vllm_model.generate_greedy(challenge_prompts, max_tokens) if
vllm_model.generate_greedy_logprobs( num_logprobs is None else vllm_model.generate_greedy_logprobs(
challenge_prompts, max_tokens, num_logprobs)) challenge_prompts, max_tokens, num_logprobs))
if num_logprobs is None: if num_logprobs is None:

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import neuronxcc.nki.language as nl import neuronxcc.nki.language as nl
import pytest import pytest
@ -99,6 +98,7 @@ def ref_block_tables_transform(
) )
@torch.inference_mode() @torch.inference_mode()
def test_load_and_transform_block_tables( def test_load_and_transform_block_tables(
monkeypatch: pytest.MonkeyPatch,
num_tiles, num_tiles,
num_blocks_per_tile, num_blocks_per_tile,
q_head_per_kv_head, q_head_per_kv_head,
@ -108,12 +108,12 @@ def test_load_and_transform_block_tables(
device = xm.xla_device() device = xm.xla_device()
compiler_flags = [ compiler_flags_str = " ".join([
"-O1", "-O1",
"--retry_failed_compilation", "--retry_failed_compilation",
] ])
compiler_flags_str = " ".join(compiler_flags) with monkeypatch.context() as m:
os.environ["NEURON_CC_FLAGS"] = compiler_flags_str m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
torch.manual_seed(10000) torch.manual_seed(10000)
torch.set_printoptions(sci_mode=False) torch.set_printoptions(sci_mode=False)

View File

@ -320,6 +320,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
]) ])
@torch.inference_mode() @torch.inference_mode()
def test_contexted_kv_attention( def test_contexted_kv_attention(
monkeypatch: pytest.MonkeyPatch,
prefill_batch_size: int, prefill_batch_size: int,
decode_batch_size: int, decode_batch_size: int,
num_heads: int, num_heads: int,
@ -329,7 +330,6 @@ def test_contexted_kv_attention(
large_tile_size, large_tile_size,
mixed_precision: bool, mixed_precision: bool,
) -> None: ) -> None:
import os
import torch_xla.core.xla_model as xm import torch_xla.core.xla_model as xm
@ -340,12 +340,12 @@ def test_contexted_kv_attention(
device = xm.xla_device() device = xm.xla_device()
compiler_flags = [ compiler_flags_str = " ".join([
"-O1", "-O1",
"--retry_failed_compilation", "--retry_failed_compilation",
] ])
compiler_flags_str = " ".join(compiler_flags) with monkeypatch.context() as m:
os.environ["NEURON_CC_FLAGS"] = compiler_flags_str m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
torch.manual_seed(0) torch.manual_seed(0)
torch.set_printoptions(sci_mode=False) torch.set_printoptions(sci_mode=False)
@ -415,7 +415,8 @@ def test_contexted_kv_attention(
num_active_blocks = pad_to_multiple(num_active_blocks, num_active_blocks = pad_to_multiple(num_active_blocks,
large_tile_size // block_size) large_tile_size // block_size)
context_kv_len = num_active_blocks * block_size context_kv_len = num_active_blocks * block_size
assert (context_kv_len % assert (
context_kv_len %
large_tile_size == 0), f"invalid context_kv_len={context_kv_len}" large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"
# pad QKV tensors # pad QKV tensors
@ -476,9 +477,11 @@ def test_contexted_kv_attention(
"constant", "constant",
0, 0,
).bool() ).bool()
attn_mask = torch.concat([prior_mask_padded, active_mask_padded], dim=1) attn_mask = torch.concat([prior_mask_padded, active_mask_padded],
dim=1)
attn_mask = reorder_context_mask(attn_mask, large_tile_size, block_size) attn_mask = reorder_context_mask(attn_mask, large_tile_size,
block_size)
input_args = ( input_args = (
query.to(device=device), query.to(device=device),
@ -508,6 +511,7 @@ def test_contexted_kv_attention(
"constant", "constant",
0, 0,
) )
output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :] output_ref = output_ref_padded.transpose(
0, 1)[0, :num_actual_tokens, :, :]
torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)

View File

@ -1,10 +1,10 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import pytest
import torch import torch
from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import get_attn_backend from vllm.attention.selector import get_attn_backend
from vllm.utils import STR_INVALID_VAL from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL
def test_platform_plugins(): def test_platform_plugins():
@ -25,8 +25,9 @@ def test_platform_plugins():
f" is loaded. The first import:\n{_init_trace}") f" is loaded. The first import:\n{_init_trace}")
def test_oot_attention_backend(monkeypatch): def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
# ignore the backend env variable if it is set # ignore the backend env variable if it is set
override_backend_env_variable(monkeypatch, STR_INVALID_VAL) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert backend.get_name() == "Dummy_Backend" assert backend.get_name() == "Dummy_Backend"

View File

@ -22,8 +22,9 @@ class DummyV1Scheduler(V1Scheduler):
raise Exception("Exception raised by DummyV1Scheduler") raise Exception("Exception raised by DummyV1Scheduler")
def test_scheduler_plugins_v0(monkeypatch): def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_V1", "0") with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
with pytest.raises(Exception) as exception_info: with pytest.raises(Exception) as exception_info:
engine_args = EngineArgs( engine_args = EngineArgs(
@ -38,14 +39,16 @@ def test_scheduler_plugins_v0(monkeypatch):
engine.add_request("0", "foo", sampling_params) engine.add_request("0", "foo", sampling_params)
engine.step() engine.step()
assert str(exception_info.value) == "Exception raised by DummyV0Scheduler" assert str(
exception_info.value) == "Exception raised by DummyV0Scheduler"
def test_scheduler_plugins_v1(monkeypatch): def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_V1", "1") with monkeypatch.context() as m:
# Explicitly turn off engine multiprocessing so that the scheduler runs in m.setenv("VLLM_USE_V1", "1")
# this process # Explicitly turn off engine multiprocessing so
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") # that the scheduler runs in this process
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with pytest.raises(Exception) as exception_info: with pytest.raises(Exception) as exception_info:
@ -61,4 +64,5 @@ def test_scheduler_plugins_v1(monkeypatch):
engine.add_request("0", "foo", sampling_params) engine.add_request("0", "foo", sampling_params)
engine.step() engine.step()
assert str(exception_info.value) == "Exception raised by DummyV1Scheduler" assert str(
exception_info.value) == "Exception raised by DummyV1Scheduler"

View File

@ -4,25 +4,29 @@
Run `pytest tests/prefix_caching/test_prefix_caching.py`. Run `pytest tests/prefix_caching/test_prefix_caching.py`.
""" """
from __future__ import annotations
import pytest import pytest
from tests.conftest import VllmRunner from tests.conftest import VllmRunner
from tests.core.utils import SchedulerProxy, create_dummy_prompt from tests.core.utils import SchedulerProxy, create_dummy_prompt
from tests.kernels.utils import override_backend_env_variable
from vllm import SamplingParams, TokensPrompt from vllm import SamplingParams, TokensPrompt
from vllm.core.scheduler import Scheduler from vllm.core.scheduler import Scheduler
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_outputs_equal from ..models.utils import check_outputs_equal
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch): def use_v0_only(monkeypatch: pytest.MonkeyPatch):
""" """
This module relies on V0 internals, so set VLLM_USE_V1=0. This module relies on V0 internals, so set VLLM_USE_V1=0.
""" """
monkeypatch.setenv('VLLM_USE_V1', '0') with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
MODELS = [ MODELS = [
@ -56,7 +60,7 @@ def test_mixed_requests(
cached_position: int, cached_position: int,
enable_chunked_prefill: bool, enable_chunked_prefill: bool,
block_size: int, block_size: int,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Test the case when some sequences have the prefix cache hit Test the case when some sequences have the prefix cache hit
@ -67,7 +71,8 @@ def test_mixed_requests(
pytest.skip("Flashinfer does not support ROCm/HIP.") pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm(): if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.") pytest.skip("Xformers does not support ROCm/HIP.")
override_backend_env_variable(monkeypatch, backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, backend)
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
@ -81,11 +86,14 @@ def test_mixed_requests(
block_size=block_size, block_size=block_size,
) as vllm_model: ) as vllm_model:
# Run the first prompt so the cache is populated # Run the first prompt so the cache is populated
vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens) vllm_outputs = vllm_model.generate_greedy([cached_prompt],
max_tokens)
# Run all the prompts # Run all the prompts
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) greedy_params = SamplingParams(temperature=0.0,
req_outputs = vllm_model.model.generate(example_prompts, greedy_params) max_tokens=max_tokens)
req_outputs = vllm_model.model.generate(example_prompts,
greedy_params)
# Verify number of cached tokens # Verify number of cached tokens
for i in range(len(req_outputs)): for i in range(len(req_outputs)):
@ -95,8 +103,8 @@ def test_mixed_requests(
block_size) * block_size block_size) * block_size
else: else:
expected_num_cached_tokens = 0 expected_num_cached_tokens = 0
assert ( assert (req_outputs[i].num_cached_tokens ==
req_outputs[i].num_cached_tokens == expected_num_cached_tokens) expected_num_cached_tokens)
vllm_outputs = [( vllm_outputs = [(
output.prompt_token_ids + list(output.outputs[0].token_ids), output.prompt_token_ids + list(output.outputs[0].token_ids),
@ -115,14 +123,15 @@ def test_mixed_requests(
def test_unstable_prompt_sequence( def test_unstable_prompt_sequence(
vllm_runner, vllm_runner,
backend: str, backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
if backend == "FLASHINFER" and current_platform.is_rocm(): if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.") pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm(): if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.") pytest.skip("Xformers does not support ROCm/HIP.")
override_backend_env_variable(monkeypatch, backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, backend)
with vllm_runner( with vllm_runner(
"Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen2.5-0.5B-Instruct",

View File

@ -56,12 +56,11 @@ def test_gc():
assert allocated < 50 * 1024 * 1024 assert allocated < 50 * 1024 * 1024
def test_model_from_modelscope(monkeypatch): def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat" with monkeypatch.context() as m:
monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True") m.setenv("VLLM_USE_MODELSCOPE", "True")
try: llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")
llm = LLM(model=MODELSCOPE_MODEL_NAME)
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
@ -73,10 +72,3 @@ def test_model_from_modelscope(monkeypatch):
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
assert len(outputs) == 4 assert len(outputs) == 4
finally:
monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False)
if __name__ == "__main__":
import pytest
pytest.main([__file__])
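The ModelScope test previously relied on a try/finally with monkeypatch.delenv to guarantee cleanup; the context manager now gives the same guarantee with less code. A sketch of the equivalence (SOME_UNRELATED_FLAG is a made-up name, shown only to illustrate delenv, which remains available when a variable must be absent):

    import os

    import pytest


    def test_modelscope_flag(monkeypatch: pytest.MonkeyPatch):
        with monkeypatch.context() as m:
            m.setenv("VLLM_USE_MODELSCOPE", "True")
            # delenv covers the opposite case: ensuring a variable is unset
            # for the duration of the block.
            m.delenv("SOME_UNRELATED_FLAG", raising=False)
            assert os.environ["VLLM_USE_MODELSCOPE"] == "True"
        # No explicit finally/delenv needed: both variables are restored to
        # their previous state when the block exits.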

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# ruff: noqa
import asyncio import asyncio
import os
import socket import socket
from collections.abc import AsyncIterator from collections.abc import AsyncIterator
from unittest.mock import patch from unittest.mock import patch
@ -112,8 +112,9 @@ def test_deprecate_kwargs_additional_message():
dummy(old_arg=1) dummy(old_arg=1)
def test_get_open_port(): def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
os.environ["VLLM_PORT"] = "5678" with monkeypatch.context() as m:
m.setenv("VLLM_PORT", "5678")
# make sure we can get multiple ports, even if the env var is set # make sure we can get multiple ports, even if the env var is set
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
s1.bind(("localhost", get_open_port())) s1.bind(("localhost", get_open_port()))
@ -121,7 +122,6 @@ def test_get_open_port():
s2.bind(("localhost", get_open_port())) s2.bind(("localhost", get_open_port()))
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
s3.bind(("localhost", get_open_port())) s3.bind(("localhost", get_open_port()))
os.environ.pop("VLLM_PORT")
# Tests for FlexibleArgumentParser # Tests for FlexibleArgumentParser
@ -366,9 +366,10 @@ def test_bind_kv_cache_non_attention():
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
def test_bind_kv_cache_encoder_decoder(monkeypatch): def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
# V1 TESTS: ENCODER_DECODER is not supported on V1 yet. # V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
monkeypatch.setenv("VLLM_USE_V1", "0") with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
from vllm.attention import Attention, AttentionType from vllm.attention import Attention, AttentionType

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os import pytest
from vllm.config import CompilationLevel from vllm.config import CompilationLevel
@ -9,10 +9,11 @@ from ..utils import compare_two_settings
# --enforce-eager on TPU causes graph compilation # --enforce-eager on TPU causes graph compilation
# this times out default Health Check in the MQLLMEngine, # this times out default Health Check in the MQLLMEngine,
# so we set the timeout here to 30s # so we set the timeout here to 30s
os.environ["VLLM_RPC_TIMEOUT"] = "30000"
def test_custom_dispatcher(): def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_RPC_TIMEOUT", "30000")
compare_two_settings( compare_two_settings(
"google/gemma-2b", "google/gemma-2b",
arg1=[ arg1=[
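Before this change, VLLM_RPC_TIMEOUT was written to os.environ at module import time, so the longer timeout leaked into the rest of the test session. Moving it into a monkeypatch context limits it to the one test that needs it. A schematic before/after:

    import os

    import pytest

    # Old pattern (removed): executed at import time, affecting every test
    # that runs after this module is collected.
    # os.environ["VLLM_RPC_TIMEOUT"] = "30000"


    def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
        # New pattern: the override exists only while this test runs.
        with monkeypatch.context() as m:
            m.setenv("VLLM_RPC_TIMEOUT", "30000")
            assert os.environ["VLLM_RPC_TIMEOUT"] == "30000"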

View File

@ -1,10 +1,12 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# ruff: noqa
# type: ignore
from __future__ import annotations
import os
import threading import threading
from collections.abc import Iterable from collections.abc import Iterable
from concurrent import futures from concurrent import futures
from typing import Callable, Literal from typing import Callable, Generator, Literal
import grpc import grpc
import pytest import pytest
@ -21,12 +23,14 @@ from vllm.tracing import SpanAttributes
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch): def use_v0_only(monkeypatch: pytest.MonkeyPatch):
""" """
Since this module is V0 only, set VLLM_USE_V1=0 for Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module. all tests in the module.
""" """
monkeypatch.setenv('VLLM_USE_V1', '0') with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
@ -67,7 +71,7 @@ class FakeTraceService(TraceServiceServicer):
@pytest.fixture @pytest.fixture
def trace_service(): def trace_service() -> Generator[FakeTraceService, None, None]:
"""Fixture to set up a fake gRPC trace service""" """Fixture to set up a fake gRPC trace service"""
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
service = FakeTraceService() service = FakeTraceService()
@ -80,12 +84,18 @@ def trace_service():
server.stop(None) server.stop(None)
def test_traces(trace_service): def test_traces(
os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" monkeypatch: pytest.MonkeyPatch,
trace_service: FakeTraceService,
):
with monkeypatch.context() as m:
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(
temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=256) max_tokens=256,
)
model = "facebook/opt-125m" model = "facebook/opt-125m"
llm = LLM( llm = LLM(
model=model, model=model,
@ -120,17 +130,19 @@ def test_traces(trace_service):
) == sampling_params.temperature ) == sampling_params.temperature
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
) == sampling_params.max_tokens
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n assert attributes.get(
assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
outputs[0].prompt_token_ids) outputs[0].prompt_token_ids)
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
metrics = outputs[0].metrics metrics = outputs[0].metrics
assert attributes.get( assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue ) == metrics.time_in_queue
ttft = metrics.first_token_time - metrics.arrival_time ttft = metrics.first_token_time - metrics.arrival_time
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
@ -145,12 +157,18 @@ def test_traces(trace_service):
assert metrics.model_execute_time is None assert metrics.model_execute_time is None
def test_traces_with_detailed_steps(trace_service): def test_traces_with_detailed_steps(
os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" monkeypatch: pytest.MonkeyPatch,
trace_service: FakeTraceService,
):
with monkeypatch.context() as m:
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(
temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=256) max_tokens=256,
)
model = "facebook/opt-125m" model = "facebook/opt-125m"
llm = LLM( llm = LLM(
model=model, model=model,
@ -186,17 +204,19 @@ def test_traces_with_detailed_steps(trace_service):
) == sampling_params.temperature ) == sampling_params.temperature
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
) == sampling_params.max_tokens
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n assert attributes.get(
assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
outputs[0].prompt_token_ids) outputs[0].prompt_token_ids)
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
metrics = outputs[0].metrics metrics = outputs[0].metrics
assert attributes.get( assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue ) == metrics.time_in_queue
ttft = metrics.first_token_time - metrics.arrival_time ttft = metrics.first_token_time - metrics.arrival_time
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
@ -207,9 +227,10 @@ def test_traces_with_detailed_steps(trace_service):
) == metrics.scheduler_time ) == metrics.scheduler_time
assert metrics.model_forward_time > 0 assert metrics.model_forward_time > 0
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD
metrics.model_forward_time / 1000) ) == pytest.approx(metrics.model_forward_time / 1000)
assert metrics.model_execute_time > 0 assert metrics.model_execute_time > 0
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE assert attributes.get(
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
) == metrics.model_execute_time ) == metrics.model_execute_time
assert metrics.model_forward_time < 1000 * metrics.model_execute_time assert metrics.model_forward_time < 1000 * metrics.model_execute_time
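Besides the OTEL_EXPORTER_OTLP_TRACES_INSECURE change, this file also annotates the trace_service fixture as a Generator, documenting that the fixture yields the service and then runs teardown. A tiny sketch of that annotation style, assuming nothing about the real FakeTraceService (FakeService is a stand-in):

    from typing import Generator

    import pytest


    class FakeService:
        """Stand-in for the fake gRPC trace service used in the real test."""


    @pytest.fixture
    def trace_service() -> Generator[FakeService, None, None]:
        service = FakeService()
        yield service
        # Teardown (stopping the gRPC server in the real fixture) runs here.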

View File

@ -566,6 +566,7 @@ def init_test_distributed_environment(
def multi_process_parallel( def multi_process_parallel(
monkeypatch: pytest.MonkeyPatch,
tp_size: int, tp_size: int,
pp_size: int, pp_size: int,
test_target: Any, test_target: Any,
@ -582,7 +583,13 @@ def multi_process_parallel(
refs = [] refs = []
for rank in range(tp_size * pp_size): for rank in range(tp_size * pp_size):
refs.append( refs.append(
test_target.remote(tp_size, pp_size, rank, distributed_init_port)) test_target.remote(
monkeypatch,
tp_size,
pp_size,
rank,
distributed_init_port,
), )
ray.get(refs) ray.get(refs)
ray.shutdown() ray.shutdown()

View File

@ -1,5 +1,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import random import random
from typing import Any
import pytest import pytest
@ -50,8 +53,12 @@ def model_name():
return "meta-llama/Meta-Llama-3-8B-Instruct" return "meta-llama/Meta-Llama-3-8B-Instruct"
def test_ngram_correctness(monkeypatch, test_prompts, sampling_config, def test_ngram_correctness(
model_name): monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
sampling_config: SamplingParams,
model_name: str,
):
''' '''
Compare the outputs of an original LLM and a speculative LLM Compare the outputs of an original LLM and a speculative LLM
should be the same when using ngram speculative decoding. should be the same when using ngram speculative decoding.

View File

@ -80,9 +80,11 @@ async def generate(engine: AsyncLLM,
[(TEXT_ENGINE_ARGS, TEXT_PROMPT), [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
(VISION_ENGINE_ARGS, VISION_PROMPT)]) (VISION_ENGINE_ARGS, VISION_PROMPT)])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load(monkeypatch, output_kind: RequestOutputKind, async def test_load(
engine_args_and_prompt: tuple[AsyncEngineArgs, monkeypatch: pytest.MonkeyPatch,
PromptType]): output_kind: RequestOutputKind,
engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType],
):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
# so that in the future when we switch, we don't have to change all the # so that in the future when we switch, we don't have to change all the
# tests. # tests.
@ -126,7 +128,8 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind,
[(TEXT_ENGINE_ARGS, TEXT_PROMPT), [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
(VISION_ENGINE_ARGS, VISION_PROMPT)]) (VISION_ENGINE_ARGS, VISION_PROMPT)])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_abort(monkeypatch, output_kind: RequestOutputKind, async def test_abort(monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
engine_args_and_prompt: tuple[AsyncEngineArgs, engine_args_and_prompt: tuple[AsyncEngineArgs,
PromptType]): PromptType]):

View File

@ -45,7 +45,7 @@ def make_request() -> EngineCoreRequest:
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_engine_core(monkeypatch): def test_engine_core(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
@ -159,7 +159,7 @@ def test_engine_core(monkeypatch):
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_engine_core_advanced_sampling(monkeypatch): def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
""" """
A basic end-to-end test to verify that the engine functions correctly A basic end-to-end test to verify that the engine functions correctly
when additional sampling parameters, such as top_p, min_tokens, and when additional sampling parameters, such as top_p, min_tokens, and
@ -209,7 +209,7 @@ def test_engine_core_advanced_sampling(monkeypatch):
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_engine_core_concurrent_batches(monkeypatch): def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
""" """
Test that the engine can handle multiple concurrent batches. Test that the engine can handle multiple concurrent batches.
""" """

View File

@ -90,7 +90,8 @@ def echo(self, msg: str, err_msg: Optional[str] = None) -> str:
@fork_new_process_for_each_test @fork_new_process_for_each_test
@pytest.mark.parametrize("multiprocessing_mode", [True, False]) @pytest.mark.parametrize("multiprocessing_mode", [True, False])
def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
@ -175,7 +176,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
@pytest.mark.asyncio(loop_scope="function") @pytest.mark.asyncio(loop_scope="function")
async def test_engine_core_client_asyncio(monkeypatch): async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")

View File

@ -255,12 +255,10 @@ def _run_and_validate(
[NONE, SAMPLE, PROMPT, SAMPLE_PROMPT]) [NONE, SAMPLE, PROMPT, SAMPLE_PROMPT])
@pytest.mark.parametrize("temperature", [0.0, 2.0]) @pytest.mark.parametrize("temperature", [0.0, 2.0])
def test_get_logprobs_and_prompt_logprobs( def test_get_logprobs_and_prompt_logprobs(
hf_model, hf_model, vllm_model,
vllm_model,
batch_logprobs_composition: BatchLogprobsComposition, batch_logprobs_composition: BatchLogprobsComposition,
temperature: float, temperature: float, example_prompts: list[str],
example_prompts, monkeypatch: pytest.MonkeyPatch) -> None:
) -> None:
"""Test V1 Engine logprobs & prompt logprobs """Test V1 Engine logprobs & prompt logprobs
Exercise a variety of combinations of `logprobs` and `prompt_logprobs` Exercise a variety of combinations of `logprobs` and `prompt_logprobs`
@ -287,6 +285,8 @@ def test_get_logprobs_and_prompt_logprobs(
temperature: "temperature" sampling parameter temperature: "temperature" sampling parameter
example_prompts: example prompt fixture example_prompts: example prompt fixture
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching
if do_apc and (temperature < 2.0 if do_apc and (temperature < 2.0
or batch_logprobs_composition != SAMPLE_PROMPT): or batch_logprobs_composition != SAMPLE_PROMPT):
@ -306,7 +306,8 @@ def test_get_logprobs_and_prompt_logprobs(
# Batch has mixed sample params # Batch has mixed sample params
# (different logprobs/prompt logprobs combos) # (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition) logprob_prompt_logprob_list = get_test_batch(
batch_logprobs_composition)
# Ensure that each test prompt has a logprob config for testing # Ensure that each test prompt has a logprob config for testing
logprob_prompt_logprob_list = _repeat_logprob_config( logprob_prompt_logprob_list = _repeat_logprob_config(
@ -333,16 +334,13 @@ def test_get_logprobs_and_prompt_logprobs(
do_apc=do_apc) do_apc=do_apc)
def test_max_logprobs(): def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs` """vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs` Should also fail for `prompt_logprobs > max_logprobs`
APC should not matter as this test checks basic request validation. APC should not matter as this test checks basic request validation.
Args:
monkeypatch
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
runner = VllmRunner("facebook/opt-125m", runner = VllmRunner("facebook/opt-125m",
max_logprobs=1, max_logprobs=1,
@ -354,40 +352,52 @@ def test_max_logprobs():
bad_sampling_params = SamplingParams(logprobs=2) bad_sampling_params = SamplingParams(logprobs=2)
with pytest.raises(ValueError): with pytest.raises(ValueError):
runner.generate(["Hello world"], sampling_params=bad_sampling_params) runner.generate(["Hello world"],
sampling_params=bad_sampling_params)
def test_none_logprobs(vllm_model, example_prompts): def test_none_logprobs(vllm_model, example_prompts,
monkeypatch: pytest.MonkeyPatch):
"""Engine should return `logprobs` and `prompt_logprobs` as `None` """Engine should return `logprobs` and `prompt_logprobs` as `None`
Args: Args:
vllm_model: vLLM model fixture vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5 max_tokens = 5
sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, sampling_params_logprobs_none = SamplingParams(
max_tokens=max_tokens,
logprobs=None, logprobs=None,
prompt_logprobs=None, prompt_logprobs=None,
temperature=0.0) temperature=0.0,
)
results_logprobs_none = vllm_model.model.generate( results_logprobs_none = vllm_model.model.generate(
example_prompts, sampling_params=sampling_params_logprobs_none) example_prompts,
sampling_params=sampling_params_logprobs_none,
)
for i in range(len(results_logprobs_none)): for i in range(len(results_logprobs_none)):
# Check sample logprobs are None # Check sample logprobs are None
assert results_logprobs_none[i].outputs[0].logprobs is None assert results_logprobs_none[i].outputs[0].logprobs is None
assert results_logprobs_none[i].outputs[0].cumulative_logprob is None assert results_logprobs_none[i].outputs[
0].cumulative_logprob is None
# Check prompt logprobs are None # Check prompt logprobs are None
assert results_logprobs_none[i].prompt_logprobs is None assert results_logprobs_none[i].prompt_logprobs is None
def test_zero_logprobs(vllm_model, example_prompts): def test_zero_logprobs(vllm_model, example_prompts,
monkeypatch: pytest.MonkeyPatch):
"""Engine should return sampled token and prompt token logprobs """Engine should return sampled token and prompt token logprobs
Args: Args:
vllm_model: vLLM model fixture vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5 max_tokens = 5
sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens, sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens,

View File

@ -3,11 +3,16 @@
Run `pytest tests/v1/tpu/test_basic.py`. Run `pytest tests/v1/tpu/test_basic.py`.
""" """
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest import pytest
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ...conftest import VllmRunner if TYPE_CHECKING:
from tests.conftest import VllmRunner
MODELS = [ MODELS = [
# "Qwen/Qwen2-7B-Instruct", # "Qwen/Qwen2-7B-Instruct",
@ -28,7 +33,8 @@ TENSOR_PARALLEL_SIZES = [1]
@pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES) @pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES)
def test_models( def test_models(
monkeypatch, vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
model: str, model: str,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
@ -41,7 +47,7 @@ def test_models(
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
with VllmRunner( with vllm_runner(
model, model,
max_model_len=8192, max_model_len=8192,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,