[CI/Test] improve robustness of test (vllm_runner) (#5357)

[CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357)
2024-06-08 01:59:20 -07:00 · 2024-06-08 01:59:20 -07:00 · 8ea5e44a43
commit 8ea5e44a43
parent 9fb900f90c
28 changed files with 431 additions and 470 deletions
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@ -46,12 +46,11 @@ def test_models(
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(model,
+    with vllm_runner(model,
                     dtype=dtype,
                     enforce_eager=enforce_eager,
-                             gpu_memory_utilization=0.7)
+                     gpu_memory_utilization=0.7) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@ -43,7 +43,7 @@ def test_models(
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(
+    with vllm_runner(
            model,
            dtype=dtype,
            max_num_batched_tokens=max_num_batched_tokens,
@ -51,9 +51,8 @@ def test_models(
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            max_num_seqs=max_num_seqs,
-    )
+    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@ -46,17 +46,16 @@ def test_chunked_prefill_recompute(
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(
+    with vllm_runner(
            model,
            dtype=dtype,
            max_num_batched_tokens=max_num_batched_tokens,
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_seqs=max_num_seqs,
-    )
+    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
                ARTIFICIAL_PREEMPTION_MAX_CNT)
-    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
@ -84,17 +83,16 @@ def test_preemption(
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(
+    with vllm_runner(
            model,
            dtype=dtype,
            disable_log_stats=False,
-    )
+    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
                ARTIFICIAL_PREEMPTION_MAX_CNT)
        total_preemption = (
            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
-    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
@ -139,19 +137,18 @@ def test_swap(
        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
                                                   max_tokens)

-    vllm_model = vllm_runner(
+    with vllm_runner(
            model,
            dtype=dtype,
            swap_space=10,
            disable_log_stats=False,
-    )
-    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
-                                                   max_tokens)
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
+                                                       beam_width, max_tokens)
        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
                ARTIFICIAL_PREEMPTION_MAX_CNT)
        total_preemption = (
            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
-    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, _ = hf_outputs[i]
@ -196,16 +193,16 @@ def test_swap_infeasible(
    decode_blocks = max_tokens // BLOCK_SIZE
    example_prompts = example_prompts[:1]

-    vllm_model = vllm_runner(
+    with vllm_runner(
            model,
            dtype=dtype,
            swap_space=10,
            block_size=BLOCK_SIZE,
-        # Since beam search have more than 1 sequence, prefill + decode blocks
-        # are not enough to finish.
+            # Since beam search has more than 1 sequence, prefill +
+            # decode blocks are not enough to finish.
            num_gpu_blocks_override=prefill_blocks + decode_blocks,
            max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
-    )
+    ) as vllm_model:
        sampling_params = SamplingParams(n=beam_width,
                                         use_beam_search=True,
                                         temperature=0.0,
@ -217,7 +214,7 @@ def test_swap_infeasible(
        )
        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
                ARTIFICIAL_PREEMPTION_MAX_CNT)
-    del vllm_model
+
    # Verify the request is ignored and does not hang.
    assert req_outputs[0].outputs[0].finish_reason == "length"

@ -236,7 +233,7 @@ def test_preemption_infeasible(
    BLOCK_SIZE = 16
    prefill_blocks = 2
    decode_blocks = max_tokens // BLOCK_SIZE
-    vllm_model = vllm_runner(
+    with vllm_runner(
            model,
            dtype=dtype,
            block_size=BLOCK_SIZE,
@ -245,8 +242,9 @@ def test_preemption_infeasible(
            # ignored instead of hanging forever.
            num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
            max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
-    )
-    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
+    ) as vllm_model:
+        sampling_params = SamplingParams(max_tokens=max_tokens,
+                                         ignore_eos=True)
        req_outputs = vllm_model.model.generate(
            example_prompts,
            sampling_params=sampling_params,
@ -254,7 +252,7 @@ def test_preemption_infeasible(

        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
                ARTIFICIAL_PREEMPTION_MAX_CNT)
-    del vllm_model
+
    # Verify the request is ignored and does not hang.
    for req_output in req_outputs:
        outputs = req_output.outputs
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -493,7 +493,10 @@ class VllmRunner:
            outputs.append(embedding)
        return outputs

-    def __del__(self):
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
        del self.model
        cleanup()

--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@ -45,14 +45,13 @@ def test_models(
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(
-        model,
+    with vllm_runner(model,
                     dtype=dtype,
                     tensor_parallel_size=2,
                     enforce_eager=enforce_eager,
-        distributed_executor_backend=distributed_executor_backend)
+                     distributed_executor_backend=distributed_executor_backend
+                     ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@ -48,7 +48,7 @@ def test_models(
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(
+    with vllm_runner(
            model,
            dtype=dtype,
            tensor_parallel_size=2,
@ -56,9 +56,8 @@ def test_models(
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_batched_tokens=max_num_batched_tokens,
            distributed_executor_backend=distributed_executor_backend,
-    )
+    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
--- a/tests/engine/test_stop_reason.py
+++ b/tests/engine/test_stop_reason.py
@ -19,9 +19,8 @@ MAX_TOKENS = 1024

@pytest.fixture
 def vllm_model(vllm_runner):
-    vllm_model = vllm_runner(MODEL)
+    with vllm_runner(MODEL) as vllm_model:
        yield vllm_model
-    del vllm_model


 def test_stop_reason(vllm_model, example_prompts):
--- a/tests/engine/test_stop_strings.py
+++ b/tests/engine/test_stop_strings.py
@ -10,7 +10,8 @@ MAX_TOKENS = 200

@pytest.fixture(scope="session")
 def vllm_model(vllm_runner):
-    return vllm_runner(MODEL)
+    with vllm_runner(MODEL) as vllm_model:
+        yield vllm_model


@pytest.mark.skip_global_cleanup
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@ -23,12 +23,14 @@ def test_metric_counter_prompt_tokens(
    dtype: str,
    max_tokens: int,
 ) -> None:
-    vllm_model = vllm_runner(model,
+    with vllm_runner(model,
                     dtype=dtype,
                     disable_log_stats=False,
-                             gpu_memory_utilization=0.4)
+                     gpu_memory_utilization=0.4) as vllm_model:
        tokenizer = vllm_model.model.get_tokenizer()
-    prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts]
+        prompt_token_counts = [
+            len(tokenizer.encode(p)) for p in example_prompts
+        ]
        # This test needs at least 2 prompts in a batch of different lengths to
        # verify their token count is correct despite padding.
        assert len(example_prompts) > 1, "at least 2 prompts are required"
@ -56,10 +58,10 @@ def test_metric_counter_generation_tokens(
    dtype: str,
    max_tokens: int,
 ) -> None:
-    vllm_model = vllm_runner(model,
+    with vllm_runner(model,
                     dtype=dtype,
                     disable_log_stats=False,
-                             gpu_memory_utilization=0.4)
+                     gpu_memory_utilization=0.4) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        tokenizer = vllm_model.model.get_tokenizer()
        stat_logger = vllm_model.model.llm_engine.stat_logger
@ -85,16 +87,14 @@ def test_metric_counter_generation_tokens(
    [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
 def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
                                   served_model_name: List[str]) -> None:
-    vllm_model = vllm_runner(model,
+    with vllm_runner(model,
                     dtype=dtype,
                     disable_log_stats=False,
                     gpu_memory_utilization=0.3,
-                             served_model_name=served_model_name)
+                     served_model_name=served_model_name) as vllm_model:
        stat_logger = vllm_model.model.llm_engine.stat_logger
        metrics_tag_content = stat_logger.labels["model_name"]

-    del vllm_model
-
    if served_model_name is None or served_model_name == []:
        assert metrics_tag_content == model, (
            f"Metrics tag model_name is wrong! expect: {model!r}\n"
--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@ -82,10 +82,9 @@ def test_models(
    num_logprobs: int,
 ) -> None:

-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
-                                                       max_tokens,
-                                                       num_logprobs)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)

    # loop through the prompts to compare against the ground truth generations
    for prompt_idx in range(len(example_prompts)):
--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@ -37,9 +37,8 @@ def test_models(
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
@ -57,9 +56,8 @@ def test_model_print(
    model: str,
    dtype: str,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
        # This test is for verifying whether the model's extra_repr
        # can be printed correctly.
        print(vllm_model.model.llm_engine.model_executor.driver_worker.
              model_runner.model)
-    del vllm_model
--- a/tests/models/test_embedding.py
+++ b/tests/models/test_embedding.py
@ -31,9 +31,8 @@ def test_models(
    with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model:
        hf_outputs = hf_model.encode(example_prompts)

-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.encode(example_prompts)
-    del vllm_model

    similarities = compare_embeddings(hf_outputs, vllm_outputs)
    all_similarities = torch.stack(similarities)
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@ -70,32 +70,29 @@ def test_models(
    model_name, revision = model

    # Run marlin.
-    gptq_marlin_model = vllm_runner(model_name=model_name,
+    with vllm_runner(model_name=model_name,
                     revision=revision,
                     dtype=dtype,
                     quantization="marlin",
                     max_model_len=MAX_MODEL_LEN,
-                                    tensor_parallel_size=1)
+                     tensor_parallel_size=1) as gptq_marlin_model:

        gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)
-    del gptq_marlin_model
    _ROPE_DICT.clear()  # clear rope cache to avoid rope dtype error

    # Run gptq.
    # The naive gptq kernel doesn't support bf16 yet.
    # Here we always compare fp16/bf16 gpt marlin kernel
    # to fp16 gptq kernel.
-    gptq_model = vllm_runner(model_name=model_name,
+    with vllm_runner(model_name=model_name,
                     revision=revision,
                     dtype="half",
                     quantization="gptq",
                     max_model_len=MAX_MODEL_LEN,
-                             tensor_parallel_size=1)
-    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts[:-1],
-                                                       max_tokens,
-                                                       num_logprobs)
-    del gptq_model
+                     tensor_parallel_size=1) as gptq_model:
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
+            example_prompts[:-1], max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
--- a/tests/models/test_gptq_marlin_24.py
+++ b/tests/models/test_gptq_marlin_24.py
@ -61,20 +61,16 @@ def test_models(
    max_tokens: int,
    num_logprobs: int,
 ) -> None:
-    marlin_24_model = vllm_runner(model_pair.model_marlin,
+    with vllm_runner(model_pair.model_marlin,
                     dtype=dtype,
-                                  quantization="gptq_marlin_24")
+                     quantization="gptq_marlin_24") as marlin_24_model:
        marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)
-    del marlin_24_model

-    gptq_model = vllm_runner(model_pair.model_gptq,
-                             dtype=dtype,
-                             quantization="gptq")
-    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
-                                                       max_tokens,
-                                                       num_logprobs)
-    del gptq_model
+    with vllm_runner(model_pair.model_gptq, dtype=dtype,
+                     quantization="gptq") as gptq_model:
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@ -94,14 +94,13 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
        for p in HF_IMAGE_PROMPTS
    ]

-    vllm_model = vllm_runner(model_id,
+    with vllm_runner(model_id,
                     dtype=dtype,
                     enforce_eager=True,
-                             **vlm_config.as_cli_args_dict())
+                     **vlm_config.as_cli_args_dict()) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                  max_tokens,
                                                  images=vllm_images)
-    del vllm_model

    for i in range(len(HF_IMAGE_PROMPTS)):
        hf_output_ids, hf_output_str = hf_outputs[i]
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@ -59,20 +59,16 @@ def test_models(
    max_tokens: int,
    num_logprobs: int,
 ) -> None:
-    marlin_model = vllm_runner(model_pair.model_marlin,
+    with vllm_runner(model_pair.model_marlin,
                     dtype=dtype,
-                               quantization="marlin")
+                     quantization="marlin") as marlin_model:
        marlin_outputs = marlin_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)
-    del marlin_model

-    gptq_model = vllm_runner(model_pair.model_gptq,
-                             dtype=dtype,
-                             quantization="gptq")
-    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
-                                                       max_tokens,
-                                                       num_logprobs)
-    del gptq_model
+    with vllm_runner(model_pair.model_gptq, dtype=dtype,
+                     quantization="gptq") as gptq_model:
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@ -30,11 +30,9 @@ def test_models(
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
-                                                       max_tokens,
-                                                       num_logprobs)
-    del vllm_model
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@ -37,9 +37,8 @@ def test_models(
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
@ -57,9 +56,8 @@ def test_model_print(
    model: str,
    dtype: str,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
        # This test is for verifying whether the model's extra_repr
        # can be printed correctly.
        print(vllm_model.model.llm_engine.model_executor.driver_worker.
              model_runner.model)
-    del vllm_model
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@ -16,12 +16,12 @@ capability = capability[0] * 10 + capability[1]
    capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
    reason='bitsandbytes is not supported on this GPU type.')
 def test_load_bnb_model(vllm_runner) -> None:
-    llm = vllm_runner('huggyllama/llama-7b',
+    with vllm_runner('huggyllama/llama-7b',
                     quantization='bitsandbytes',
                     load_format='bitsandbytes',
-                      enforce_eager=True)
+                     enforce_eager=True) as llm:

-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501

        # check the weights in MLP & SelfAttention are quantized to torch.uint8
        qweight = model.model.layers[0].mlp.gate_up_proj.qweight
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@ -12,8 +12,9 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso

 def test_compressed_tensors_w8a8_static_setup(vllm_runner):
    model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed"
-    llm = vllm_runner(model_path, quantization="sparseml", enforce_eager=True)
-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+    with vllm_runner(model_path, quantization="sparseml",
+                     enforce_eager=True) as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj
@ -23,8 +24,10 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner):

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(gate_up_proj.quant_method,
+                          CompressedTensorsLinearMethod)
+        assert isinstance(down_proj.quant_method,
+                          CompressedTensorsLinearMethod)

        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)

@ -39,11 +42,11 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner):

 def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
    model_path = "nm-testing/tinyllama-one-shot-dynamic-test"
-    llm = vllm_runner(model_path,
+    with vllm_runner(model_path,
                     quantization="sparseml",
                     enforce_eager=True,
-                      dtype=torch.float16)
-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+                     dtype=torch.float16) as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@ -16,9 +16,9 @@ capability = capability[0] * 10 + capability[1]
    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
    reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
-    llm = vllm_runner("facebook/opt-125m", quantization="fp8")
+    with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:

-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        fc1 = model.model.decoder.layers[0].fc1
        assert isinstance(fc1.quant_method, Fp8LinearMethod)
        assert fc1.weight.dtype == torch.float8_e4m3fn
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@ -2,10 +2,8 @@

 Run `pytest tests/samplers/test_beam_search.py`.
 """
-import gc

 import pytest
-import torch

 # FIXME(zhuohan): The test can not pass if we:
 #   1. Increase max_tokens to 256.
@ -34,14 +32,9 @@ def test_beam_search_single_input(
        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
                                                   max_tokens)

-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
-                                                   max_tokens)
-    del vllm_model
-    # NOTE(woosuk): For some reason, the following GC is required to avoid
-    # GPU OOM errors in the following tests using `vllm_runner`.
-    gc.collect()
-    torch.cuda.empty_cache()
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
+                                                       beam_width, max_tokens)

    for i in range(len(example_prompts)):
        hf_output_ids, _ = hf_outputs[i]
--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@ -22,8 +22,9 @@ def test_ignore_eos(
    dtype: str,
    max_tokens: int,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
-    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        sampling_params = SamplingParams(max_tokens=max_tokens,
+                                         ignore_eos=True)

        for prompt in example_prompts:
            ignore_eos_output = vllm_model.model.generate(
--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@ -14,7 +14,7 @@ def test_logits_processor_force_generate(
    model: str,
    dtype: str,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
        tokenizer = vllm_model.model.get_tokenizer()
        repeat_times = 2
        enforced_answers = " vLLM"
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@ -38,14 +38,14 @@ def test_get_prompt_logprobs(
            max_tokens=max_tokens,
        )

-    vllm_model = vllm_runner(
+    with vllm_runner(
            model,
            dtype=dtype,
            max_logprobs=num_top_logprobs,
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_batched_tokens=max_num_batched_tokens,
            max_num_seqs=max_num_seqs,
-    )
+    ) as vllm_model:
        vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
                                              logprobs=num_top_logprobs,
                                              prompt_logprobs=num_top_logprobs,
--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@ -17,23 +17,18 @@ def test_ranks(
    num_top_logprobs = 5
    num_prompt_logprobs = 5

-    vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs)
+    with vllm_runner(model, dtype=dtype,
+                     max_logprobs=num_top_logprobs) as vllm_model:

        ## Test greedy logprobs ranks
-    vllm_sampling_params = SamplingParams(temperature=0.0,
+        vllm_sampling_params = SamplingParams(
+            temperature=0.0,
            top_p=1.0,
            max_tokens=max_tokens,
            logprobs=num_top_logprobs,
            prompt_logprobs=num_prompt_logprobs)
        vllm_results = vllm_model.generate_w_logprobs(example_prompts,
                                                      vllm_sampling_params)
-    for result in vllm_results:
-        assert result[2] is not None
-        assert len(result[2]) == len(result[0])
-        # check whether all chosen tokens have ranks = 1
-        for token, logprobs in zip(result[0], result[2]):
-            assert token in logprobs
-            assert logprobs[token].rank == 1

        ## Test non-greedy logprobs ranks
        sampling_params = SamplingParams(temperature=1.0,
@ -42,6 +37,15 @@ def test_ranks(
                                         logprobs=num_top_logprobs,
                                         prompt_logprobs=num_prompt_logprobs)
        res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
+
+    for result in vllm_results:
+        assert result[2] is not None
+        assert len(result[2]) == len(result[0])
+        # check whether all chosen tokens have ranks = 1
+        for token, logprobs in zip(result[0], result[2]):
+            assert token in logprobs
+            assert logprobs[token].rank == 1
+
    for result in res:
        assert result[2] is not None
        assert len(result[2]) == len(result[0])
--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@ -17,9 +17,8 @@ RANDOM_SEEDS = list(range(5))

@pytest.fixture
 def vllm_model(vllm_runner):
-    vllm_model = vllm_runner(MODEL, dtype="half")
+    with vllm_runner(MODEL, dtype="half") as vllm_model:
        yield vllm_model
-    del vllm_model


@pytest.mark.parametrize("seed", RANDOM_SEEDS)
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@ -1,4 +1,3 @@
-import gc
 import json
 import os
 import subprocess
@ -7,7 +6,6 @@ from unittest.mock import MagicMock, patch
 import openai
 import pytest
 import ray
-import torch

 from vllm import SamplingParams
 # yapf: disable
@ -71,15 +69,15 @@ def test_can_deserialize_s3(vllm_runner):
    model_ref = "EleutherAI/pythia-1.4b"
    tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"

-    loaded_hf_model = vllm_runner(model_ref,
+    with vllm_runner(model_ref,
                                  load_format="tensorizer",
                                  model_loader_extra_config=TensorizerConfig(
                                      tensorizer_uri=tensorized_path,
                                      num_readers=1,
                                      s3_endpoint="object.ord1.coreweave.com",
-                                  ))
+                                  )) as loaded_hf_model:

-    deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params)
+        deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params) # noqa: E501

        assert deserialized_outputs

@ -87,7 +85,7 @@ def test_can_deserialize_s3(vllm_runner):
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_deserialized_encrypted_vllm_model_has_same_outputs(
        vllm_runner, tmp_path):
-    vllm_model = vllm_runner(model_ref)
+    with vllm_runner(model_ref) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")
        key_path = tmp_path / (model_ref + ".key")
        outputs = vllm_model.generate(prompts, sampling_params)
@ -97,19 +95,15 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
                            config_for_serializing,
                            encryption_key_path=key_path)

-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
-
    config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                encryption_keyfile=key_path)

-    loaded_vllm_model = vllm_runner(
+    with vllm_runner(
        model_ref,
        load_format="tensorizer",
-        model_loader_extra_config=config_for_deserializing)
+        model_loader_extra_config=config_for_deserializing) as loaded_vllm_model: # noqa: E501

-    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) # noqa: E501

        assert outputs == deserialized_outputs

@ -124,12 +118,12 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
            serializer = TensorSerializer(stream)
            serializer.write_module(hf_model.model)

-    loaded_hf_model = vllm_runner(model_ref,
+    with vllm_runner(model_ref,
                                  load_format="tensorizer",
                                  model_loader_extra_config=TensorizerConfig(
                                      tensorizer_uri=model_path,
                                      num_readers=1,
-                                  ))
+                                  )) as loaded_hf_model:

        deserialized_outputs = loaded_hf_model.generate_greedy(
            prompts, max_tokens=max_tokens)
@ -148,16 +142,13 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
    test_prompts = create_test_prompts(lora_path)

    # Serialize model before deserializing and binding LoRA adapters
-    vllm_model = vllm_runner(model_ref, )
+    with vllm_runner(model_ref, ) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")

        serialize_vllm_model(vllm_model.model.llm_engine,
                            TensorizerConfig(tensorizer_uri=model_path))

-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
-    loaded_vllm_model = vllm_runner(
+    with vllm_runner(
        model_ref,
        load_format="tensorizer",
        model_loader_extra_config=TensorizerConfig(
@ -170,7 +161,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
        max_cpu_loras=2,
        max_num_seqs=50,
        max_model_len=1000,
-    )
+    ) as loaded_vllm_model:
        process_requests(loaded_vllm_model.model.llm_engine, test_prompts)

        assert loaded_vllm_model
@ -186,7 +177,7 @@ def test_load_without_tensorizer_load_format(vllm_runner):
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
    ## Serialize model
-    vllm_model = vllm_runner(model_ref, )
+    with vllm_runner(model_ref, ) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")

        serialize_vllm_model(vllm_model.model.llm_engine,
@ -196,10 +187,6 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
            "tensorizer_uri": str(model_path),
        }

-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
-
    ## Start OpenAI API server
    openai_args = [
        "--model", model_ref, "--dtype", "float16", "--load-format",
@ -260,18 +247,15 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
    model_path = tmp_path / (model_ref + ".tensors")
    config = TensorizerConfig(tensorizer_uri=str(model_path))

-    vllm_model = vllm_runner(model_ref)
+    with vllm_runner(model_ref) as vllm_model:
        outputs = vllm_model.generate(prompts, sampling_params)
        serialize_vllm_model(vllm_model.model.llm_engine, config)

        assert is_vllm_tensorized(config)
-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()

-    loaded_vllm_model = vllm_runner(model_ref,
+    with vllm_runner(model_ref,
                    load_format="tensorizer",
-                                    model_loader_extra_config=config)
-    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+                    model_loader_extra_config=config) as loaded_vllm_model:
+        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) # noqa: E501

        assert outputs == deserialized_outputs