From 9fb900f90cbb5614c3e7d67446325ad8b7ac04b2 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 7 Jun 2024 22:31:32 -0700
Subject: [PATCH] [CI/Test] improve robustness of test (hf_runner) (#5347)

[CI/Test] improve robustness of test by replacing del with context manager (hf_runner) (#5347)
---
 .../test_basic_correctness.py | 5 ++---
 .../basic_correctness/test_chunked_prefill.py | 5 ++---
 tests/basic_correctness/test_preemption.py | 17 +++++++----------
 tests/conftest.py | 5 ++++-
 .../test_basic_distributed_correctness.py | 5 ++---
 .../test_chunked_prefill_distributed.py | 5 ++---
 tests/models/test_big_models.py | 5 ++---
 tests/models/test_embedding.py | 5 ++---
 tests/models/test_llava.py | 9 ++++-----
 tests/models/test_mistral.py | 7 +++----
 tests/models/test_models.py | 5 ++---
 tests/samplers/test_beam_search.py | 7 +++----
 tests/samplers/test_logprobs.py | 11 +++++------
 tests/tensorizer_loader/test_tensorizer.py | 18 ++++++++----------
 14 files changed, 48 insertions(+), 61 deletions(-)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 7d811744..4561c8b1 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -43,9 +43,8 @@ def test_models(
     if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
         pytest.skip("Skipping non-eager test for FlashInferBackend.")

-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

     vllm_model = vllm_runner(model,
                              dtype=dtype,
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index 47d582c7..8f8494f3 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -40,9 +40,8 @@ def test_models(
     enable_chunked_prefill = True
     max_num_batched_tokens = chunked_prefill_token_size

-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

     vllm_model = vllm_runner(
         model,
diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py
index 29a4c39c..58610e9e 100644
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -43,9 +43,8 @@ def test_chunked_prefill_recompute(
     enable_chunked_prefill = True
     max_num_batched_tokens = chunked_prefill_token_size

-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

     vllm_model = vllm_runner(
         model,
@@ -82,9 +81,8 @@ def test_preemption(
 ) -> None:
     """By default, recompute preemption is enabled"""

-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

     vllm_model = vllm_runner(
         model,
@@ -137,10 +135,9 @@ def test_swap(
 ) -> None:
     """Use beam search enables swapping."""
     example_prompts = example_prompts[:1]
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
-                                               max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
+                                                   max_tokens)

     vllm_model = vllm_runner(
         model,
diff --git a/tests/conftest.py b/tests/conftest.py
index 1a7037eb..5becf84a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -354,7 +354,10 @@ class HfRunner:
     def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]:
         return self.model.encode(prompts)

-    def __del__(self):
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
         del self.model
         cleanup()

diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py
index 3ba5cea3..23e35482 100644
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -42,9 +42,8 @@ def test_models(
     backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
     enforce_eager = backend_by_env_var == "FLASHINFER"

-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

     vllm_model = vllm_runner(
         model,
diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py
index db938cc6..9af48831 100644
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@@ -45,9 +45,8 @@ def test_models(
     enable_chunked_prefill = True
     max_num_batched_tokens = chunked_prefill_token_size

-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

     vllm_model = vllm_runner(
         model,
diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py
index ea95e6a4..4af9824e 100644
--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@@ -34,9 +34,8 @@ def test_models(
     dtype: str,
     max_tokens: int,
 ) -> None:
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py
index 668ed3a5..8ad9ac2d 100644
--- a/tests/models/test_embedding.py
+++ b/tests/models/test_embedding.py
@@ -28,9 +28,8 @@ def test_models(
     model: str,
     dtype: str,
 ) -> None:
-    hf_model = hf_runner(model, dtype=dtype, is_embedding_model=True)
-    hf_outputs = hf_model.encode(example_prompts)
-    del hf_model
+    with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model:
+        hf_outputs = hf_model.encode(example_prompts)

     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.encode(example_prompts)
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index f03dbdbb..1e7ee528 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -84,11 +84,10 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     """
     model_id, vlm_config = model_and_config

-    hf_model = hf_runner(model_id, dtype=dtype, is_vision_model=True)
-    hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
-                                          max_tokens,
-                                          images=hf_images)
-    del hf_model
+    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
+                                              max_tokens,
+                                              images=hf_images)

     vllm_image_prompts = [
         p.replace("<image>", "<image>" * vlm_config.image_feature_size)
diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py
index 76b248cf..178d23d3 100644
--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@@ -26,10 +26,9 @@ def test_models(
     num_logprobs: int,
 ) -> None:
     # TODO(sang): Sliding window should be tested separately.
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy_logprobs_limit(
-        example_prompts, max_tokens, num_logprobs)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)

     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index e4609620..a80ac5d9 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -34,9 +34,8 @@ def test_models(
     # To pass the small model tests, we need full precision.
     assert dtype == "float"

-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py
index 2682f284..2e373cb8 100644
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -30,10 +30,9 @@ def test_beam_search_single_input(
     beam_width: int,
 ) -> None:
     example_prompts = example_prompts[:1]
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
-                                               max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
+                                                   max_tokens)

     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
index 61720ccc..25d59391 100644
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -32,12 +32,11 @@ def test_get_prompt_logprobs(
     max_num_batched_tokens = chunked_prefill_token_size
     max_tokens = 5

-    hf_model = hf_runner(model, dtype=dtype)
-    hf_logprobs = hf_model.generate_greedy_logprobs(
-        example_prompts,
-        max_tokens=max_tokens,
-    )
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_logprobs = hf_model.generate_greedy_logprobs(
+            example_prompts,
+            max_tokens=max_tokens,
+        )

     vllm_model = vllm_runner(
         model,
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 1579d53a..648de4db 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -116,16 +116,14 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(

 def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
                                                 tmp_path):
-    hf_model = hf_runner(model_ref)
-    model_path = tmp_path / (model_ref + ".tensors")
-    max_tokens = 50
-    outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
-    with open_stream(model_path, "wb+") as stream:
-        serializer = TensorSerializer(stream)
-        serializer.write_module(hf_model.model)
-    del hf_model
-    gc.collect()
-    torch.cuda.empty_cache()
+    with hf_runner(model_ref) as hf_model:
+        model_path = tmp_path / (model_ref + ".tensors")
+        max_tokens = 50
+        outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
+        with open_stream(model_path, "wb+") as stream:
+            serializer = TensorSerializer(stream)
+            serializer.write_module(hf_model.model)
+
     loaded_hf_model = vllm_runner(model_ref,
                                   load_format="tensorizer",
                                   model_loader_extra_config=TensorizerConfig(