From 8ea5e44a435e8731fd6f5ba4c329dd112752532a Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 8 Jun 2024 01:59:20 -0700 Subject: [PATCH] [CI/Test] improve robustness of test (vllm_runner) (#5357) [CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357) --- .../test_basic_correctness.py | 11 +- .../basic_correctness/test_chunked_prefill.py | 21 ++- tests/basic_correctness/test_preemption.py | 148 +++++++++--------- tests/conftest.py | 5 +- .../test_basic_distributed_correctness.py | 15 +- .../test_chunked_prefill_distributed.py | 21 ++- tests/engine/test_stop_reason.py | 5 +- tests/engine/test_stop_strings.py | 3 +- tests/metrics/test_metrics.py | 82 +++++----- tests/models/test_aqlm.py | 7 +- tests/models/test_big_models.py | 16 +- tests/models/test_embedding.py | 5 +- tests/models/test_gptq_marlin.py | 35 ++--- tests/models/test_gptq_marlin_24.py | 22 ++- tests/models/test_llava.py | 15 +- tests/models/test_marlin.py | 22 ++- tests/models/test_mistral.py | 8 +- tests/models/test_models.py | 16 +- tests/quantization/test_bitsandbytes.py | 98 ++++++------ tests/quantization/test_compressed_tensors.py | 59 +++---- tests/quantization/test_fp8.py | 10 +- tests/samplers/test_beam_search.py | 13 +- tests/samplers/test_ignore_eos.py | 15 +- tests/samplers/test_logits_processor.py | 72 ++++----- tests/samplers/test_logprobs.py | 30 ++-- tests/samplers/test_ranks.py | 36 +++-- tests/samplers/test_seeded_generate.py | 5 +- tests/tensorizer_loader/test_tensorizer.py | 106 ++++++------- 28 files changed, 431 insertions(+), 470 deletions(-) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 4561c8b1..805b8883 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -46,12 +46,11 @@ def test_models( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner(model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model + with vllm_runner(model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 8f8494f3..48d60912 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -43,17 +43,16 @@ def test_models( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=enable_chunked_prefill, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - ) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model + with vllm_runner( + model, + dtype=dtype, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=enable_chunked_prefill, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + ) as vllm_model: + vllm_outputs = 
vllm_model.generate_greedy(example_prompts, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 58610e9e..7f20b2d9 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -46,17 +46,16 @@ def test_chunked_prefill_recompute( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=enable_chunked_prefill, - max_num_seqs=max_num_seqs, - ) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < - ARTIFICIAL_PREEMPTION_MAX_CNT) - del vllm_model + with vllm_runner( + model, + dtype=dtype, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=enable_chunked_prefill, + max_num_seqs=max_num_seqs, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < + ARTIFICIAL_PREEMPTION_MAX_CNT) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -84,17 +83,16 @@ def test_preemption( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner( - model, - dtype=dtype, - disable_log_stats=False, - ) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < - ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.model.llm_engine.scheduler.num_cumulative_preemption) - del vllm_model + with vllm_runner( + model, + dtype=dtype, + disable_log_stats=False, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < + ARTIFICIAL_PREEMPTION_MAX_CNT) + total_preemption = ( + vllm_model.model.llm_engine.scheduler.num_cumulative_preemption) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -139,19 +137,18 @@ def test_swap( hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, max_tokens) - vllm_model = vllm_runner( - model, - dtype=dtype, - swap_space=10, - disable_log_stats=False, - ) - vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, - max_tokens) - assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < - ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.model.llm_engine.scheduler.num_cumulative_preemption) - del vllm_model + with vllm_runner( + model, + dtype=dtype, + swap_space=10, + disable_log_stats=False, + ) as vllm_model: + vllm_outputs = vllm_model.generate_beam_search(example_prompts, + beam_width, max_tokens) + assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < + ARTIFICIAL_PREEMPTION_MAX_CNT) + total_preemption = ( + vllm_model.model.llm_engine.scheduler.num_cumulative_preemption) for i in range(len(example_prompts)): hf_output_ids, _ = hf_outputs[i] @@ -196,28 +193,28 @@ def test_swap_infeasible( decode_blocks = max_tokens // BLOCK_SIZE example_prompts = example_prompts[:1] - vllm_model = vllm_runner( - model, - dtype=dtype, - swap_space=10, - block_size=BLOCK_SIZE, - 
# Since beam search have more than 1 sequence, prefill + decode blocks - # are not enough to finish. - num_gpu_blocks_override=prefill_blocks + decode_blocks, - max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE, - ) - sampling_params = SamplingParams(n=beam_width, - use_beam_search=True, - temperature=0.0, - max_tokens=max_tokens, - ignore_eos=True) - req_outputs = vllm_model.model.generate( - example_prompts, - sampling_params=sampling_params, - ) - assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < - ARTIFICIAL_PREEMPTION_MAX_CNT) - del vllm_model + with vllm_runner( + model, + dtype=dtype, + swap_space=10, + block_size=BLOCK_SIZE, + # Since beam search have more than 1 sequence, prefill + + # decode blocks are not enough to finish. + num_gpu_blocks_override=prefill_blocks + decode_blocks, + max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE, + ) as vllm_model: + sampling_params = SamplingParams(n=beam_width, + use_beam_search=True, + temperature=0.0, + max_tokens=max_tokens, + ignore_eos=True) + req_outputs = vllm_model.model.generate( + example_prompts, + sampling_params=sampling_params, + ) + assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < + ARTIFICIAL_PREEMPTION_MAX_CNT) + # Verify the request is ignored and not hang. assert req_outputs[0].outputs[0].finish_reason == "length" @@ -236,25 +233,26 @@ def test_preemption_infeasible( BLOCK_SIZE = 16 prefill_blocks = 2 decode_blocks = max_tokens // BLOCK_SIZE - vllm_model = vllm_runner( - model, - dtype=dtype, - block_size=BLOCK_SIZE, - # Not enough gpu blocks to complete a single sequence. - # preemption should happen, and the sequence should be - # ignored instead of hanging forever. - num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, - max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - ) - sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) - req_outputs = vllm_model.model.generate( - example_prompts, - sampling_params=sampling_params, - ) + with vllm_runner( + model, + dtype=dtype, + block_size=BLOCK_SIZE, + # Not enough gpu blocks to complete a single sequence. + # preemption should happen, and the sequence should be + # ignored instead of hanging forever. + num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, + max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), + ) as vllm_model: + sampling_params = SamplingParams(max_tokens=max_tokens, + ignore_eos=True) + req_outputs = vllm_model.model.generate( + example_prompts, + sampling_params=sampling_params, + ) + + assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < + ARTIFICIAL_PREEMPTION_MAX_CNT) - assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt < - ARTIFICIAL_PREEMPTION_MAX_CNT) - del vllm_model # Verify the request is ignored and not hang. 
for req_output in req_outputs: outputs = req_output.outputs diff --git a/tests/conftest.py b/tests/conftest.py index 5becf84a..e0680467 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -493,7 +493,10 @@ class VllmRunner: outputs.append(embedding) return outputs - def __del__(self): + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): del self.model cleanup() diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 23e35482..eb423aef 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -45,14 +45,13 @@ def test_models( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=2, - enforce_eager=enforce_eager, - distributed_executor_backend=distributed_executor_backend) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model + with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=2, + enforce_eager=enforce_eager, + distributed_executor_backend=distributed_executor_backend + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py index 9af48831..4e4e468c 100644 --- a/tests/distributed/test_chunked_prefill_distributed.py +++ b/tests/distributed/test_chunked_prefill_distributed.py @@ -48,17 +48,16 @@ def test_models( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=2, - max_num_seqs=max_num_seqs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - ) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model + with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=2, + max_num_seqs=max_num_seqs, + enable_chunked_prefill=enable_chunked_prefill, + max_num_batched_tokens=max_num_batched_tokens, + distributed_executor_backend=distributed_executor_backend, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py index 7b886507..b0bd6c4a 100644 --- a/tests/engine/test_stop_reason.py +++ b/tests/engine/test_stop_reason.py @@ -19,9 +19,8 @@ MAX_TOKENS = 1024 @pytest.fixture def vllm_model(vllm_runner): - vllm_model = vllm_runner(MODEL) - yield vllm_model - del vllm_model + with vllm_runner(MODEL) as vllm_model: + yield vllm_model def test_stop_reason(vllm_model, example_prompts): diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 6b747beb..1584b85a 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -10,7 +10,8 @@ MAX_TOKENS = 200 @pytest.fixture(scope="session") def vllm_model(vllm_runner): - return vllm_runner(MODEL) + with vllm_runner(MODEL) as vllm_model: + yield vllm_model @pytest.mark.skip_global_cleanup diff --git 
a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index e0aa14f1..c1164739 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -23,23 +23,25 @@ def test_metric_counter_prompt_tokens( dtype: str, max_tokens: int, ) -> None: - vllm_model = vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) - tokenizer = vllm_model.model.get_tokenizer() - prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] - # This test needs at least 2 prompts in a batch of different lengths to - # verify their token count is correct despite padding. - assert len(example_prompts) > 1, "at least 2 prompts are required" - assert prompt_token_counts[0] != prompt_token_counts[1], ( - "prompts of different lengths are required") - vllm_prompt_token_count = sum(prompt_token_counts) + with vllm_runner(model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4) as vllm_model: + tokenizer = vllm_model.model.get_tokenizer() + prompt_token_counts = [ + len(tokenizer.encode(p)) for p in example_prompts + ] + # This test needs at least 2 prompts in a batch of different lengths to + # verify their token count is correct despite padding. + assert len(example_prompts) > 1, "at least 2 prompts are required" + assert prompt_token_counts[0] != prompt_token_counts[1], ( + "prompts of different lengths are required") + vllm_prompt_token_count = sum(prompt_token_counts) - _ = vllm_model.generate_greedy(example_prompts, max_tokens) - stat_logger = vllm_model.model.llm_engine.stat_logger - metric_count = stat_logger.metrics.counter_prompt_tokens.labels( - **stat_logger.labels)._value.get() + _ = vllm_model.generate_greedy(example_prompts, max_tokens) + stat_logger = vllm_model.model.llm_engine.stat_logger + metric_count = stat_logger.metrics.counter_prompt_tokens.labels( + **stat_logger.labels)._value.get() assert vllm_prompt_token_count == metric_count, ( f"prompt token count: {vllm_prompt_token_count!r}\n" @@ -56,22 +58,22 @@ def test_metric_counter_generation_tokens( dtype: str, max_tokens: int, ) -> None: - vllm_model = vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.model.get_tokenizer() - stat_logger = vllm_model.model.llm_engine.stat_logger - metric_count = stat_logger.metrics.counter_generation_tokens.labels( - **stat_logger.labels)._value.get() - vllm_generation_count = 0 - for i in range(len(example_prompts)): - vllm_output_ids, vllm_output_str = vllm_outputs[i] - prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. - # We're interested only in the count of the generation tokens. - vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) + with vllm_runner(model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + tokenizer = vllm_model.model.get_tokenizer() + stat_logger = vllm_model.model.llm_engine.stat_logger + metric_count = stat_logger.metrics.counter_generation_tokens.labels( + **stat_logger.labels)._value.get() + vllm_generation_count = 0 + for i in range(len(example_prompts)): + vllm_output_ids, vllm_output_str = vllm_outputs[i] + prompt_ids = tokenizer.encode(example_prompts[i]) + # vllm_output_ids contains both prompt tokens and generation tokens. 
+ # We're interested only in the count of the generation tokens. + vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) assert vllm_generation_count == metric_count, ( f"generation token count: {vllm_generation_count!r}\n" @@ -85,15 +87,13 @@ def test_metric_counter_generation_tokens( [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]]) def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, served_model_name: List[str]) -> None: - vllm_model = vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.3, - served_model_name=served_model_name) - stat_logger = vllm_model.model.llm_engine.stat_logger - metrics_tag_content = stat_logger.labels["model_name"] - - del vllm_model + with vllm_runner(model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.3, + served_model_name=served_model_name) as vllm_model: + stat_logger = vllm_model.model.llm_engine.stat_logger + metrics_tag_content = stat_logger.labels["model_name"] if served_model_name is None or served_model_name == []: assert metrics_tag_content == model, ( diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 85d74f7f..c4ecf846 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -82,10 +82,9 @@ def test_models( num_logprobs: int, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, - max_tokens, - num_logprobs) + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) # loop through the prompts to compare against the ground truth generations for prompt_idx in range(len(example_prompts)): diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 4af9824e..ef782837 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -37,9 +37,8 @@ def test_models( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -57,9 +56,8 @@ def test_model_print( model: str, dtype: str, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype) - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) - del vllm_model + with vllm_runner(model, dtype=dtype) as vllm_model: + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. 
+ model_runner.model) diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py index 8ad9ac2d..6556998b 100644 --- a/tests/models/test_embedding.py +++ b/tests/models/test_embedding.py @@ -31,9 +31,8 @@ def test_models( with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model: hf_outputs = hf_model.encode(example_prompts) - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.encode(example_prompts) - del vllm_model + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.encode(example_prompts) similarities = compare_embeddings(hf_outputs, vllm_outputs) all_similarities = torch.stack(similarities) diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 814471b4..e957450c 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -70,32 +70,29 @@ def test_models( model_name, revision = model # Run marlin. - gptq_marlin_model = vllm_runner(model_name=model_name, - revision=revision, - dtype=dtype, - quantization="marlin", - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=1) + with vllm_runner(model_name=model_name, + revision=revision, + dtype=dtype, + quantization="marlin", + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=1) as gptq_marlin_model: - gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs( - example_prompts[:-1], max_tokens, num_logprobs) - del gptq_marlin_model + gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs( + example_prompts[:-1], max_tokens, num_logprobs) _ROPE_DICT.clear() # clear rope cache to avoid rope dtype error # Run gptq. # The naive gptq kernel doesn't support bf16 yet. # Here we always compare fp16/bf16 gpt marlin kernel # to fp16 gptq kernel. - gptq_model = vllm_runner(model_name=model_name, - revision=revision, - dtype="half", - quantization="gptq", - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=1) - gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts[:-1], - max_tokens, - num_logprobs) - del gptq_model + with vllm_runner(model_name=model_name, + revision=revision, + dtype="half", + quantization="gptq", + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=1) as gptq_model: + gptq_outputs = gptq_model.generate_greedy_logprobs( + example_prompts[:-1], max_tokens, num_logprobs) check_logprobs_close( outputs_0_lst=gptq_outputs, diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index cc35ee80..195c3e5b 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -61,20 +61,16 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: - marlin_24_model = vllm_runner(model_pair.model_marlin, - dtype=dtype, - quantization="gptq_marlin_24") - marlin_24_outputs = marlin_24_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - del marlin_24_model + with vllm_runner(model_pair.model_marlin, + dtype=dtype, + quantization="gptq_marlin_24") as marlin_24_model: + marlin_24_outputs = marlin_24_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - gptq_model = vllm_runner(model_pair.model_gptq, - dtype=dtype, - quantization="gptq") - gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts, - max_tokens, - num_logprobs) - del gptq_model + with vllm_runner(model_pair.model_gptq, dtype=dtype, + quantization="gptq") as gptq_model: + gptq_outputs = gptq_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) 
check_logprobs_close( outputs_0_lst=gptq_outputs, diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 1e7ee528..1f446362 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -94,14 +94,13 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images, for p in HF_IMAGE_PROMPTS ] - vllm_model = vllm_runner(model_id, - dtype=dtype, - enforce_eager=True, - **vlm_config.as_cli_args_dict()) - vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, - max_tokens, - images=vllm_images) - del vllm_model + with vllm_runner(model_id, + dtype=dtype, + enforce_eager=True, + **vlm_config.as_cli_args_dict()) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, + max_tokens, + images=vllm_images) for i in range(len(HF_IMAGE_PROMPTS)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 8520b267..761ba6aa 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -59,20 +59,16 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: - marlin_model = vllm_runner(model_pair.model_marlin, - dtype=dtype, - quantization="marlin") - marlin_outputs = marlin_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - del marlin_model + with vllm_runner(model_pair.model_marlin, + dtype=dtype, + quantization="marlin") as marlin_model: + marlin_outputs = marlin_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - gptq_model = vllm_runner(model_pair.model_gptq, - dtype=dtype, - quantization="gptq") - gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts, - max_tokens, - num_logprobs) - del gptq_model + with vllm_runner(model_pair.model_gptq, dtype=dtype, + quantization="gptq") as gptq_model: + gptq_outputs = gptq_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) check_logprobs_close( outputs_0_lst=gptq_outputs, diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 178d23d3..6acc057f 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -30,11 +30,9 @@ def test_models( hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, - max_tokens, - num_logprobs) - del vllm_model + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, diff --git a/tests/models/test_models.py b/tests/models/test_models.py index a80ac5d9..71238d69 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -37,9 +37,8 @@ def test_models( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -57,9 +56,8 @@ def test_model_print( model: str, dtype: str, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype) - # This test is for verifying whether the model's extra_repr - # can be printed 
correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) - del vllm_model + with vllm_runner(model, dtype=dtype) as vllm_model: + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 4e9feb3c..31e938d1 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -16,65 +16,65 @@ capability = capability[0] * 10 + capability[1] capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(), reason='bitsandbytes is not supported on this GPU type.') def test_load_bnb_model(vllm_runner) -> None: - llm = vllm_runner('huggyllama/llama-7b', - quantization='bitsandbytes', - load_format='bitsandbytes', - enforce_eager=True) + with vllm_runner('huggyllama/llama-7b', + quantization='bitsandbytes', + load_format='bitsandbytes', + enforce_eager=True) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - # check the weights in MLP & SelfAttention are quantized to torch.uint8 - qweight = model.model.layers[0].mlp.gate_up_proj.qweight - assert qweight.dtype == torch.uint8, ( - f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}') + # check the weights in MLP & SelfAttention are quantized to torch.uint8 + qweight = model.model.layers[0].mlp.gate_up_proj.qweight + assert qweight.dtype == torch.uint8, ( + f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}') - qweight = model.model.layers[0].mlp.down_proj.qweight - assert qweight.dtype == torch.uint8, ( - f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}') + qweight = model.model.layers[0].mlp.down_proj.qweight + assert qweight.dtype == torch.uint8, ( + f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}') - qweight = model.model.layers[0].self_attn.o_proj.qweight - assert qweight.dtype == torch.uint8, ( - f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}') + qweight = model.model.layers[0].self_attn.o_proj.qweight + assert qweight.dtype == torch.uint8, ( + f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}') - qweight = model.model.layers[0].self_attn.qkv_proj.qweight - assert qweight.dtype == torch.uint8, ( - f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}') + qweight = model.model.layers[0].self_attn.qkv_proj.qweight + assert qweight.dtype == torch.uint8, ( + f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}') - # some weights should not be quantized - weight = model.lm_head.weight - assert weight.dtype != torch.uint8, ( - 'lm_head weight dtype should not be torch.uint8') + # some weights should not be quantized + weight = model.lm_head.weight + assert weight.dtype != torch.uint8, ( + 'lm_head weight dtype should not be torch.uint8') - weight = model.model.embed_tokens.weight - assert weight.dtype != torch.uint8, ( - 'embed_tokens weight dtype should not be torch.uint8') + weight = model.model.embed_tokens.weight + assert weight.dtype != torch.uint8, ( + 'embed_tokens weight dtype should not be torch.uint8') - weight = model.model.layers[0].input_layernorm.weight - assert weight.dtype != torch.uint8, ( - 'input_layernorm weight dtype should not be torch.uint8') + weight = model.model.layers[0].input_layernorm.weight + assert 
weight.dtype != torch.uint8, ( + 'input_layernorm weight dtype should not be torch.uint8') - weight = model.model.layers[0].post_attention_layernorm.weight - assert weight.dtype != torch.uint8, ( - 'input_layernorm weight dtype should not be torch.uint8') + weight = model.model.layers[0].post_attention_layernorm.weight + assert weight.dtype != torch.uint8, ( + 'input_layernorm weight dtype should not be torch.uint8') - # check the output of the model is expected - sampling_params = SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=8) + # check the output of the model is expected + sampling_params = SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=8) - prompts = ['That which does not kill us', 'To be or not to be,'] - expected_outputs = [ - 'That which does not kill us makes us stronger.', - 'To be or not to be, that is the question.' - ] - outputs = llm.generate(prompts, sampling_params=sampling_params) + prompts = ['That which does not kill us', 'To be or not to be,'] + expected_outputs = [ + 'That which does not kill us makes us stronger.', + 'To be or not to be, that is the question.' + ] + outputs = llm.generate(prompts, sampling_params=sampling_params) - assert len(outputs) == len(prompts) + assert len(outputs) == len(prompts) - for index in range(len(outputs)): - # compare the first line of the output - actual_output = outputs[index][1][0].split('\n', 1)[0] - expected_output = expected_outputs[index].split('\n', 1)[0] - assert actual_output == expected_output, ( - f'Expected: {expected_output}, but got: {actual_output}') + for index in range(len(outputs)): + # compare the first line of the output + actual_output = outputs[index][1][0].split('\n', 1)[0] + expected_output = expected_outputs[index].split('\n', 1)[0] + assert actual_output == expected_output, ( + f'Expected: {expected_output}, but got: {actual_output}') diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 8b48f418..9d94d2ec 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -12,42 +12,45 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso def test_compressed_tensors_w8a8_static_setup(vllm_runner): model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed" - llm = vllm_runner(model_path, quantization="sparseml", enforce_eager=True) - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model - layer = model.model.layers[0] + with vllm_runner(model_path, quantization="sparseml", + enforce_eager=True) as llm: + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - o_proj = layer.self_attn.o_proj - gate_up_proj = layer.mlp.gate_up_proj - down_proj = layer.mlp.down_proj + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) + assert 
isinstance(gate_up_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(down_proj.quant_method, + CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor) - assert qkv_proj.weight.dtype is torch.int8 - assert o_proj.weight.dtype is torch.int8 - assert gate_up_proj.weight.dtype is torch.int8 + assert qkv_proj.weight.dtype is torch.int8 + assert o_proj.weight.dtype is torch.int8 + assert gate_up_proj.weight.dtype is torch.int8 - assert qkv_proj.weight_scale.shard_splitter is not None - assert qkv_proj.weight_scale.logical_widths is not None - assert qkv_proj.input_scale.dtype is torch.float32 + assert qkv_proj.weight_scale.shard_splitter is not None + assert qkv_proj.weight_scale.logical_widths is not None + assert qkv_proj.input_scale.dtype is torch.float32 def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): model_path = "nm-testing/tinyllama-one-shot-dynamic-test" - llm = vllm_runner(model_path, - quantization="sparseml", - enforce_eager=True, - dtype=torch.float16) - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model - layer = model.model.layers[0] + with vllm_runner(model_path, + quantization="sparseml", + enforce_eager=True, + dtype=torch.float16) as llm: + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken) - assert qkv_proj.weight.dtype is torch.int8 + assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken) + assert qkv_proj.weight.dtype is torch.int8 diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 607544a1..fccce7f7 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -16,9 +16,9 @@ capability = capability[0] * 10 + capability[1] capability < QUANTIZATION_METHODS["fp8"].get_min_capability(), reason="FP8 is not supported on this GPU type.") def test_load_fp16_model(vllm_runner) -> None: - llm = vllm_runner("facebook/opt-125m", quantization="fp8") + with vllm_runner("facebook/opt-125m", quantization="fp8") as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model - fc1 = model.model.decoder.layers[0].fc1 - assert isinstance(fc1.quant_method, Fp8LinearMethod) - assert fc1.weight.dtype == torch.float8_e4m3fn + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + fc1 = model.model.decoder.layers[0].fc1 + assert isinstance(fc1.quant_method, Fp8LinearMethod) + assert fc1.weight.dtype == torch.float8_e4m3fn diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 2e373cb8..64f3ce94 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -2,10 +2,8 @@ Run `pytest tests/samplers/test_beam_search.py`. """ -import gc import pytest -import torch # FIXME(zhuohan): The test can not pass if we: # 1. Increase max_tokens to 256. 
@@ -34,14 +32,9 @@ def test_beam_search_single_input( hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, max_tokens) - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, - max_tokens) - del vllm_model - # NOTE(woosuk): For some reason, the following GC is required to avoid - # GPU OOM errors in the following tests using `vllm_runner`. - gc.collect() - torch.cuda.empty_cache() + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_beam_search(example_prompts, + beam_width, max_tokens) for i in range(len(example_prompts)): hf_output_ids, _ = hf_outputs[i] diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 67b5168b..dc2482d8 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -22,11 +22,12 @@ def test_ignore_eos( dtype: str, max_tokens: int, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype) - sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) + with vllm_runner(model, dtype=dtype) as vllm_model: + sampling_params = SamplingParams(max_tokens=max_tokens, + ignore_eos=True) - for prompt in example_prompts: - ignore_eos_output = vllm_model.model.generate( - prompt, sampling_params=sampling_params) - output_length = len(ignore_eos_output[0].outputs[0].token_ids) - assert output_length == max_tokens + for prompt in example_prompts: + ignore_eos_output = vllm_model.model.generate( + prompt, sampling_params=sampling_params) + output_length = len(ignore_eos_output[0].outputs[0].token_ids) + assert output_length == max_tokens diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 0ccbabff..29794701 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -14,46 +14,46 @@ def test_logits_processor_force_generate( model: str, dtype: str, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype) - tokenizer = vllm_model.model.get_tokenizer() - repeat_times = 2 - enforced_answers = " vLLM" - vllm_token_ids = tokenizer.encode(enforced_answers, - add_special_tokens=False) - max_tokens = len(vllm_token_ids) * repeat_times + with vllm_runner(model, dtype=dtype) as vllm_model: + tokenizer = vllm_model.model.get_tokenizer() + repeat_times = 2 + enforced_answers = " vLLM" + vllm_token_ids = tokenizer.encode(enforced_answers, + add_special_tokens=False) + max_tokens = len(vllm_token_ids) * repeat_times - def pick_vllm(token_ids, logits): - token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)] - logits[token_id] = torch.finfo(logits.dtype).max - return logits + def pick_vllm(token_ids, logits): + token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)] + logits[token_id] = torch.finfo(logits.dtype).max + return logits - params_with_logprobs = SamplingParams( - logits_processors=[pick_vllm], - prompt_logprobs=3, - max_tokens=max_tokens, - ) - - # test logits_processors when prompt_logprobs is not None - vllm_model.model._add_request( - example_prompts[0], - params=params_with_logprobs, - ) - - # test prompt_logprobs is not None - vllm_model.model._add_request( - example_prompts[1], - params=SamplingParams( + params_with_logprobs = SamplingParams( + logits_processors=[pick_vllm], prompt_logprobs=3, max_tokens=max_tokens, - ), - ) + ) - # test grouped requests - vllm_model.model._add_request( - example_prompts[2], - params=SamplingParams(max_tokens=max_tokens), - ) + # test 
logits_processors when prompt_logprobs is not None + vllm_model.model._add_request( + example_prompts[0], + params=params_with_logprobs, + ) - outputs = vllm_model.model._run_engine(use_tqdm=False) + # test prompt_logprobs is not None + vllm_model.model._add_request( + example_prompts[1], + params=SamplingParams( + prompt_logprobs=3, + max_tokens=max_tokens, + ), + ) - assert outputs[0].outputs[0].text == enforced_answers * repeat_times + # test grouped requests + vllm_model.model._add_request( + example_prompts[2], + params=SamplingParams(max_tokens=max_tokens), + ) + + outputs = vllm_model.model._run_engine(use_tqdm=False) + + assert outputs[0].outputs[0].text == enforced_answers * repeat_times diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 25d59391..233540cd 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -38,21 +38,21 @@ def test_get_prompt_logprobs( max_tokens=max_tokens, ) - vllm_model = vllm_runner( - model, - dtype=dtype, - max_logprobs=num_top_logprobs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs, - ) - vllm_sampling_params = SamplingParams(max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_top_logprobs, - temperature=0.0, - detokenize=detokenize) - vllm_results = vllm_model.model.generate( - example_prompts, sampling_params=vllm_sampling_params) + with vllm_runner( + model, + dtype=dtype, + max_logprobs=num_top_logprobs, + enable_chunked_prefill=enable_chunked_prefill, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + ) as vllm_model: + vllm_sampling_params = SamplingParams(max_tokens=max_tokens, + logprobs=num_top_logprobs, + prompt_logprobs=num_top_logprobs, + temperature=0.0, + detokenize=detokenize) + vllm_results = vllm_model.model.generate( + example_prompts, sampling_params=vllm_sampling_params) # Test whether logprobs are included in the results. 
for result in vllm_results: diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index 5e93238d..ed2fee1a 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -17,16 +17,27 @@ def test_ranks( num_top_logprobs = 5 num_prompt_logprobs = 5 - vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs) + with vllm_runner(model, dtype=dtype, + max_logprobs=num_top_logprobs) as vllm_model: + + ## Test greedy logprobs ranks + vllm_sampling_params = SamplingParams( + temperature=0.0, + top_p=1.0, + max_tokens=max_tokens, + logprobs=num_top_logprobs, + prompt_logprobs=num_prompt_logprobs) + vllm_results = vllm_model.generate_w_logprobs(example_prompts, + vllm_sampling_params) + + ## Test non-greedy logprobs ranks + sampling_params = SamplingParams(temperature=1.0, + top_p=1.0, + max_tokens=max_tokens, + logprobs=num_top_logprobs, + prompt_logprobs=num_prompt_logprobs) + res = vllm_model.generate_w_logprobs(example_prompts, sampling_params) - ## Test greedy logprobs ranks - vllm_sampling_params = SamplingParams(temperature=0.0, - top_p=1.0, - max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_prompt_logprobs) - vllm_results = vllm_model.generate_w_logprobs(example_prompts, - vllm_sampling_params) for result in vllm_results: assert result[2] is not None assert len(result[2]) == len(result[0]) @@ -35,13 +46,6 @@ def test_ranks( assert token in logprobs assert logprobs[token].rank == 1 - ## Test non-greedy logprobs ranks - sampling_params = SamplingParams(temperature=1.0, - top_p=1.0, - max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_prompt_logprobs) - res = vllm_model.generate_w_logprobs(example_prompts, sampling_params) for result in res: assert result[2] is not None assert len(result[2]) == len(result[0]) diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index fef5ff3f..88067f19 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -17,9 +17,8 @@ RANDOM_SEEDS = list(range(5)) @pytest.fixture def vllm_model(vllm_runner): - vllm_model = vllm_runner(MODEL, dtype="half") - yield vllm_model - del vllm_model + with vllm_runner(MODEL, dtype="half") as vllm_model: + yield vllm_model @pytest.mark.parametrize("seed", RANDOM_SEEDS) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 648de4db..b558bfc6 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -1,4 +1,3 @@ -import gc import json import os import subprocess @@ -7,7 +6,6 @@ from unittest.mock import MagicMock, patch import openai import pytest import ray -import torch from vllm import SamplingParams # yapf: disable @@ -71,47 +69,43 @@ def test_can_deserialize_s3(vllm_runner): model_ref = "EleutherAI/pythia-1.4b" tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors" - loaded_hf_model = vllm_runner(model_ref, + with vllm_runner(model_ref, load_format="tensorizer", model_loader_extra_config=TensorizerConfig( tensorizer_uri=tensorized_path, num_readers=1, s3_endpoint="object.ord1.coreweave.com", - )) + )) as loaded_hf_model: - deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params) + deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params) # noqa: E501 - assert deserialized_outputs + assert deserialized_outputs @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def 
test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): - vllm_model = vllm_runner(model_ref) - model_path = tmp_path / (model_ref + ".tensors") - key_path = tmp_path / (model_ref + ".key") - outputs = vllm_model.generate(prompts, sampling_params) + with vllm_runner(model_ref) as vllm_model: + model_path = tmp_path / (model_ref + ".tensors") + key_path = tmp_path / (model_ref + ".key") + outputs = vllm_model.generate(prompts, sampling_params) - config_for_serializing = TensorizerConfig(tensorizer_uri=model_path) - serialize_vllm_model(vllm_model.model.llm_engine, - config_for_serializing, - encryption_key_path=key_path) - - del vllm_model - gc.collect() - torch.cuda.empty_cache() + config_for_serializing = TensorizerConfig(tensorizer_uri=model_path) + serialize_vllm_model(vllm_model.model.llm_engine, + config_for_serializing, + encryption_key_path=key_path) config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) - loaded_vllm_model = vllm_runner( + with vllm_runner( model_ref, load_format="tensorizer", - model_loader_extra_config=config_for_deserializing) + model_loader_extra_config=config_for_deserializing) as loaded_vllm_model: # noqa: E501 - deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) + deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) # noqa: E501 - assert outputs == deserialized_outputs + assert outputs == deserialized_outputs def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, @@ -124,17 +118,17 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, serializer = TensorSerializer(stream) serializer.write_module(hf_model.model) - loaded_hf_model = vllm_runner(model_ref, + with vllm_runner(model_ref, load_format="tensorizer", model_loader_extra_config=TensorizerConfig( tensorizer_uri=model_path, num_readers=1, - )) + )) as loaded_hf_model: - deserialized_outputs = loaded_hf_model.generate_greedy( - prompts, max_tokens=max_tokens) + deserialized_outputs = loaded_hf_model.generate_greedy( + prompts, max_tokens=max_tokens) - assert outputs == deserialized_outputs + assert outputs == deserialized_outputs def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): @@ -148,16 +142,13 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): test_prompts = create_test_prompts(lora_path) # Serialize model before deserializing and binding LoRA adapters - vllm_model = vllm_runner(model_ref, ) - model_path = tmp_path / (model_ref + ".tensors") + with vllm_runner(model_ref, ) as vllm_model: + model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(vllm_model.model.llm_engine, - TensorizerConfig(tensorizer_uri=model_path)) + serialize_vllm_model(vllm_model.model.llm_engine, + TensorizerConfig(tensorizer_uri=model_path)) - del vllm_model - gc.collect() - torch.cuda.empty_cache() - loaded_vllm_model = vllm_runner( + with vllm_runner( model_ref, load_format="tensorizer", model_loader_extra_config=TensorizerConfig( @@ -170,10 +161,10 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): max_cpu_loras=2, max_num_seqs=50, max_model_len=1000, - ) - process_requests(loaded_vllm_model.model.llm_engine, test_prompts) + ) as loaded_vllm_model: + process_requests(loaded_vllm_model.model.llm_engine, test_prompts) - assert loaded_vllm_model + assert loaded_vllm_model def test_load_without_tensorizer_load_format(vllm_runner): @@ -186,19 +177,15 @@ def test_load_without_tensorizer_load_format(vllm_runner): 
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): ## Serialize model - vllm_model = vllm_runner(model_ref, ) - model_path = tmp_path / (model_ref + ".tensors") + with vllm_runner(model_ref, ) as vllm_model: + model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(vllm_model.model.llm_engine, - TensorizerConfig(tensorizer_uri=model_path)) + serialize_vllm_model(vllm_model.model.llm_engine, + TensorizerConfig(tensorizer_uri=model_path)) - model_loader_extra_config = { - "tensorizer_uri": str(model_path), - } - - del vllm_model - gc.collect() - torch.cuda.empty_cache() + model_loader_extra_config = { + "tensorizer_uri": str(model_path), + } ## Start OpenAI API server openai_args = [ @@ -260,18 +247,15 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): model_path = tmp_path / (model_ref + ".tensors") config = TensorizerConfig(tensorizer_uri=str(model_path)) - vllm_model = vllm_runner(model_ref) - outputs = vllm_model.generate(prompts, sampling_params) - serialize_vllm_model(vllm_model.model.llm_engine, config) + with vllm_runner(model_ref) as vllm_model: + outputs = vllm_model.generate(prompts, sampling_params) + serialize_vllm_model(vllm_model.model.llm_engine, config) - assert is_vllm_tensorized(config) - del vllm_model - gc.collect() - torch.cuda.empty_cache() + assert is_vllm_tensorized(config) - loaded_vllm_model = vllm_runner(model_ref, - load_format="tensorizer", - model_loader_extra_config=config) - deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) + with vllm_runner(model_ref, + load_format="tensorizer", + model_loader_extra_config=config) as loaded_vllm_model: + deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) # noqa: E501 - assert outputs == deserialized_outputs + assert outputs == deserialized_outputs
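
For reference, every hunk above applies the same pattern: VllmRunner instances that were previously released with a manual `del vllm_model` (in some tests followed by gc.collect() and torch.cuda.empty_cache()) are now used as context managers, which the new __enter__/__exit__ methods added to VllmRunner in tests/conftest.py make possible, so cleanup runs as soon as the `with` block exits even when an assertion fails mid-test. The sketch below is a minimal, self-contained illustration of that pattern under simplifying assumptions: DummyRunner, its placeholder dict "model", and the cleanup() stub are hypothetical stand-ins for the real VllmRunner and the cleanup() helper in tests/conftest.py, so the example runs without a GPU.

class DummyRunner:
    """Simplified analogue of VllmRunner's new context-manager support."""

    def __init__(self, model_name: str):
        # The real runner constructs a vLLM engine here; a dict keeps the
        # sketch runnable without a GPU.
        self.model = {"name": model_name}

    def generate_greedy(self, prompts):
        # Placeholder for the runner's generate_greedy() helper.
        return [prompt + " -> <generated text>" for prompt in prompts]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Mirrors the conftest.py change: release the model and clean up
        # as soon as the `with` block exits, even if the test body raised,
        # instead of relying on `del` and garbage-collection timing.
        del self.model
        cleanup()


def cleanup():
    # Stand-in for the cleanup() helper used by tests/conftest.py, which
    # frees accelerator memory between tests.
    print("cleanup() called")


if __name__ == "__main__":
    with DummyRunner("facebook/opt-125m") as runner:
        print(runner.generate_greedy(["Hello, my name is"]))
    # By this point cleanup() has already run, deterministically.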