[CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357)
Author: youkaichao (2024-06-08 01:59:20 -07:00), committed by GitHub
Parent: 9fb900f90c
Commit: 8ea5e44a43
28 changed files with 431 additions and 470 deletions

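The change applies one pattern across the test suite: instead of creating a runner, using it, and relying on `del vllm_model` (sometimes followed by manual gc.collect()/torch.cuda.empty_cache()) for cleanup, each test now opens the runner in a with-block so cleanup runs even when an assertion in the middle of the test fails. A minimal before/after sketch of that shape (illustrative only — the model name and token count are placeholders; `vllm_runner` is the pytest fixture the tests already use):

# Before: cleanup depends on reaching the `del`, which is skipped if an
# assertion or exception occurs first.
def test_example_old(vllm_runner, example_prompts):
    vllm_model = vllm_runner("facebook/opt-125m", dtype="half")
    vllm_outputs = vllm_model.generate_greedy(example_prompts, 32)
    assert vllm_outputs
    del vllm_model

# After: VllmRunner.__exit__ releases the model and calls cleanup() no matter
# how the block is left.
def test_example_new(vllm_runner, example_prompts):
    with vllm_runner("facebook/opt-125m", dtype="half") as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, 32)
        assert vllm_outputs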
View File

@@ -46,12 +46,11 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    vllm_model = vllm_runner(model,
-                             dtype=dtype,
-                             enforce_eager=enforce_eager,
-                             gpu_memory_utilization=0.7)
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(model,
+                     dtype=dtype,
+                     enforce_eager=enforce_eager,
+                     gpu_memory_utilization=0.7) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]

View File

@@ -43,17 +43,16 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        max_num_batched_tokens=max_num_batched_tokens,
-        enable_chunked_prefill=enable_chunked_prefill,
-        tensor_parallel_size=tensor_parallel_size,
-        enforce_eager=enforce_eager,
-        max_num_seqs=max_num_seqs,
-    )
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=enable_chunked_prefill,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            max_num_seqs=max_num_seqs,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]

View File

@@ -46,17 +46,16 @@ def test_chunked_prefill_recompute(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        max_num_batched_tokens=max_num_batched_tokens,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_seqs=max_num_seqs,
-    )
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_seqs=max_num_seqs,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
@@ -84,17 +83,16 @@ def test_preemption(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        disable_log_stats=False,
-    )
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    total_preemption = (
-        vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            disable_log_stats=False,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        total_preemption = (
+            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
@@ -139,19 +137,18 @@ def test_swap(
         hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
                                                    max_tokens)
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        swap_space=10,
-        disable_log_stats=False,
-    )
-    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
-                                                   max_tokens)
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    total_preemption = (
-        vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            swap_space=10,
+            disable_log_stats=False,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
+                                                       beam_width, max_tokens)
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        total_preemption = (
+            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
     for i in range(len(example_prompts)):
         hf_output_ids, _ = hf_outputs[i]
@@ -196,28 +193,28 @@ def test_swap_infeasible(
     decode_blocks = max_tokens // BLOCK_SIZE
     example_prompts = example_prompts[:1]
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        swap_space=10,
-        block_size=BLOCK_SIZE,
-        # Since beam search have more than 1 sequence, prefill + decode blocks
-        # are not enough to finish.
-        num_gpu_blocks_override=prefill_blocks + decode_blocks,
-        max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
-    )
-    sampling_params = SamplingParams(n=beam_width,
-                                     use_beam_search=True,
-                                     temperature=0.0,
-                                     max_tokens=max_tokens,
-                                     ignore_eos=True)
-    req_outputs = vllm_model.model.generate(
-        example_prompts,
-        sampling_params=sampling_params,
-    )
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            swap_space=10,
+            block_size=BLOCK_SIZE,
+            # Since beam search have more than 1 sequence, prefill +
+            # decode blocks are not enough to finish.
+            num_gpu_blocks_override=prefill_blocks + decode_blocks,
+            max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
+    ) as vllm_model:
+        sampling_params = SamplingParams(n=beam_width,
+                                         use_beam_search=True,
+                                         temperature=0.0,
+                                         max_tokens=max_tokens,
+                                         ignore_eos=True)
+        req_outputs = vllm_model.model.generate(
+            example_prompts,
+            sampling_params=sampling_params,
+        )
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)
     # Verify the request is ignored and not hang.
     assert req_outputs[0].outputs[0].finish_reason == "length"
@@ -236,25 +233,26 @@ def test_preemption_infeasible(
     BLOCK_SIZE = 16
     prefill_blocks = 2
     decode_blocks = max_tokens // BLOCK_SIZE
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        block_size=BLOCK_SIZE,
-        # Not enough gpu blocks to complete a single sequence.
-        # preemption should happen, and the sequence should be
-        # ignored instead of hanging forever.
-        num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
-        max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
-    )
-    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
-    req_outputs = vllm_model.model.generate(
-        example_prompts,
-        sampling_params=sampling_params,
-    )
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            block_size=BLOCK_SIZE,
+            # Not enough gpu blocks to complete a single sequence.
+            # preemption should happen, and the sequence should be
+            # ignored instead of hanging forever.
+            num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
+            max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
+    ) as vllm_model:
+        sampling_params = SamplingParams(max_tokens=max_tokens,
+                                         ignore_eos=True)
+        req_outputs = vllm_model.model.generate(
+            example_prompts,
+            sampling_params=sampling_params,
+        )
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)
     # Verify the request is ignored and not hang.
     for req_output in req_outputs:
         outputs = req_output.outputs

View File

@@ -493,7 +493,10 @@ class VllmRunner:
             outputs.append(embedding)
         return outputs
-    def __del__(self):
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
         del self.model
         cleanup()

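The `__enter__`/`__exit__` pair added above is what lets every test site say `with vllm_runner(...) as vllm_model:`. Unlike `__del__`, whose timing depends on garbage collection, `__exit__` is invoked by the interpreter on every exit from the block, including exits caused by a failing assert. A self-contained sketch of the same protocol under assumed names (DummyRunner is not vLLM code, just an illustration of the shape):

class DummyRunner:
    """Toy stand-in that mimics the shape of VllmRunner's context manager."""

    def __init__(self, name: str):
        self.model = f"loaded:{name}"

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Runs on normal exit and when an exception is raised inside the block.
        del self.model
        print("cleanup")


def demo() -> None:
    # Equivalent to: runner = DummyRunner(...); try: ...; finally: runner.__exit__(...)
    with DummyRunner("opt-125m") as runner:
        assert runner.model.startswith("loaded:")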
View File

@@ -45,14 +45,13 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        tensor_parallel_size=2,
-        enforce_eager=enforce_eager,
-        distributed_executor_backend=distributed_executor_backend)
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(model,
+                     dtype=dtype,
+                     tensor_parallel_size=2,
+                     enforce_eager=enforce_eager,
+                     distributed_executor_backend=distributed_executor_backend
+                     ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]

View File

@@ -48,17 +48,16 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        tensor_parallel_size=2,
-        max_num_seqs=max_num_seqs,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_batched_tokens=max_num_batched_tokens,
-        distributed_executor_backend=distributed_executor_backend,
-    )
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=2,
+            max_num_seqs=max_num_seqs,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_batched_tokens=max_num_batched_tokens,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]

View File

@@ -19,9 +19,8 @@ MAX_TOKENS = 1024
 @pytest.fixture
 def vllm_model(vllm_runner):
-    vllm_model = vllm_runner(MODEL)
-    yield vllm_model
-    del vllm_model
+    with vllm_runner(MODEL) as vllm_model:
+        yield vllm_model
 def test_stop_reason(vllm_model, example_prompts):

View File

@@ -10,7 +10,8 @@ MAX_TOKENS = 200
 @pytest.fixture(scope="session")
 def vllm_model(vllm_runner):
-    return vllm_runner(MODEL)
+    with vllm_runner(MODEL) as vllm_model:
+        yield vllm_model
 @pytest.mark.skip_global_cleanup

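The fixture changes above follow the same idea: a fixture that used to `return` the runner (or `yield` it and then `del` it) now yields from inside the with-block, so the runner stays alive for the tests that use the fixture and `__exit__` runs during pytest's fixture teardown. A hedged sketch of the pattern (MODEL is a placeholder; `vllm_runner` is the suite's existing fixture):

import pytest

MODEL = "facebook/opt-125m"  # placeholder model name


@pytest.fixture(scope="session")
def vllm_model(vllm_runner):
    # Teardown (VllmRunner.__exit__) runs when the fixture is finalized,
    # i.e. after the last test in the session that uses it.
    with vllm_runner(MODEL) as vllm_model:
        yield vllm_model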
View File

@@ -23,23 +23,25 @@ def test_metric_counter_prompt_tokens(
     dtype: str,
     max_tokens: int,
 ) -> None:
-    vllm_model = vllm_runner(model,
-                             dtype=dtype,
-                             disable_log_stats=False,
-                             gpu_memory_utilization=0.4)
-    tokenizer = vllm_model.model.get_tokenizer()
-    prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts]
-    # This test needs at least 2 prompts in a batch of different lengths to
-    # verify their token count is correct despite padding.
-    assert len(example_prompts) > 1, "at least 2 prompts are required"
-    assert prompt_token_counts[0] != prompt_token_counts[1], (
-        "prompts of different lengths are required")
-    vllm_prompt_token_count = sum(prompt_token_counts)
-    _ = vllm_model.generate_greedy(example_prompts, max_tokens)
-    stat_logger = vllm_model.model.llm_engine.stat_logger
-    metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
-        **stat_logger.labels)._value.get()
+    with vllm_runner(model,
+                     dtype=dtype,
+                     disable_log_stats=False,
+                     gpu_memory_utilization=0.4) as vllm_model:
+        tokenizer = vllm_model.model.get_tokenizer()
+        prompt_token_counts = [
+            len(tokenizer.encode(p)) for p in example_prompts
+        ]
+        # This test needs at least 2 prompts in a batch of different lengths to
+        # verify their token count is correct despite padding.
+        assert len(example_prompts) > 1, "at least 2 prompts are required"
+        assert prompt_token_counts[0] != prompt_token_counts[1], (
+            "prompts of different lengths are required")
+        vllm_prompt_token_count = sum(prompt_token_counts)
+        _ = vllm_model.generate_greedy(example_prompts, max_tokens)
+        stat_logger = vllm_model.model.llm_engine.stat_logger
+        metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
+            **stat_logger.labels)._value.get()
     assert vllm_prompt_token_count == metric_count, (
         f"prompt token count: {vllm_prompt_token_count!r}\n"
@@ -56,22 +58,22 @@ def test_metric_counter_generation_tokens(
     dtype: str,
     max_tokens: int,
 ) -> None:
-    vllm_model = vllm_runner(model,
-                             dtype=dtype,
-                             disable_log_stats=False,
-                             gpu_memory_utilization=0.4)
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    tokenizer = vllm_model.model.get_tokenizer()
-    stat_logger = vllm_model.model.llm_engine.stat_logger
-    metric_count = stat_logger.metrics.counter_generation_tokens.labels(
-        **stat_logger.labels)._value.get()
-    vllm_generation_count = 0
-    for i in range(len(example_prompts)):
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        prompt_ids = tokenizer.encode(example_prompts[i])
-        # vllm_output_ids contains both prompt tokens and generation tokens.
-        # We're interested only in the count of the generation tokens.
-        vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
+    with vllm_runner(model,
+                     dtype=dtype,
+                     disable_log_stats=False,
+                     gpu_memory_utilization=0.4) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        tokenizer = vllm_model.model.get_tokenizer()
+        stat_logger = vllm_model.model.llm_engine.stat_logger
+        metric_count = stat_logger.metrics.counter_generation_tokens.labels(
+            **stat_logger.labels)._value.get()
+        vllm_generation_count = 0
+        for i in range(len(example_prompts)):
+            vllm_output_ids, vllm_output_str = vllm_outputs[i]
+            prompt_ids = tokenizer.encode(example_prompts[i])
+            # vllm_output_ids contains both prompt tokens and generation tokens.
+            # We're interested only in the count of the generation tokens.
+            vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
     assert vllm_generation_count == metric_count, (
         f"generation token count: {vllm_generation_count!r}\n"
@@ -85,15 +87,13 @@ def test_metric_counter_generation_tokens(
     [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
 def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
                                    served_model_name: List[str]) -> None:
-    vllm_model = vllm_runner(model,
-                             dtype=dtype,
-                             disable_log_stats=False,
-                             gpu_memory_utilization=0.3,
-                             served_model_name=served_model_name)
-    stat_logger = vllm_model.model.llm_engine.stat_logger
-    metrics_tag_content = stat_logger.labels["model_name"]
-    del vllm_model
+    with vllm_runner(model,
+                     dtype=dtype,
+                     disable_log_stats=False,
+                     gpu_memory_utilization=0.3,
+                     served_model_name=served_model_name) as vllm_model:
+        stat_logger = vllm_model.model.llm_engine.stat_logger
+        metrics_tag_content = stat_logger.labels["model_name"]
     if served_model_name is None or served_model_name == []:
         assert metrics_tag_content == model, (

View File

@@ -82,10 +82,9 @@ def test_models(
     num_logprobs: int,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
-                                                       max_tokens,
-                                                       num_logprobs)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
     # loop through the prompts to compare against the ground truth generations
     for prompt_idx in range(len(example_prompts)):

View File

@@ -37,9 +37,8 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
@@ -57,9 +56,8 @@ def test_model_print(
     model: str,
     dtype: str,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
-    # This test is for verifying whether the model's extra_repr
-    # can be printed correctly.
-    print(vllm_model.model.llm_engine.model_executor.driver_worker.
-          model_runner.model)
-    del vllm_model
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        # This test is for verifying whether the model's extra_repr
+        # can be printed correctly.
+        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+              model_runner.model)

View File

@@ -31,9 +31,8 @@ def test_models(
     with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model:
         hf_outputs = hf_model.encode(example_prompts)
-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.encode(example_prompts)
-    del vllm_model
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.encode(example_prompts)
     similarities = compare_embeddings(hf_outputs, vllm_outputs)
     all_similarities = torch.stack(similarities)

View File

@@ -70,32 +70,29 @@ def test_models(
     model_name, revision = model
     # Run marlin.
-    gptq_marlin_model = vllm_runner(model_name=model_name,
-                                    revision=revision,
-                                    dtype=dtype,
-                                    quantization="marlin",
-                                    max_model_len=MAX_MODEL_LEN,
-                                    tensor_parallel_size=1)
-    gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
-        example_prompts[:-1], max_tokens, num_logprobs)
-    del gptq_marlin_model
+    with vllm_runner(model_name=model_name,
+                     revision=revision,
+                     dtype=dtype,
+                     quantization="marlin",
+                     max_model_len=MAX_MODEL_LEN,
+                     tensor_parallel_size=1) as gptq_marlin_model:
+        gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
+            example_prompts[:-1], max_tokens, num_logprobs)
     _ROPE_DICT.clear()  # clear rope cache to avoid rope dtype error
     # Run gptq.
     # The naive gptq kernel doesn't support bf16 yet.
     # Here we always compare fp16/bf16 gpt marlin kernel
     # to fp16 gptq kernel.
-    gptq_model = vllm_runner(model_name=model_name,
-                             revision=revision,
-                             dtype="half",
-                             quantization="gptq",
-                             max_model_len=MAX_MODEL_LEN,
-                             tensor_parallel_size=1)
-    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts[:-1],
-                                                       max_tokens,
-                                                       num_logprobs)
-    del gptq_model
+    with vllm_runner(model_name=model_name,
+                     revision=revision,
+                     dtype="half",
+                     quantization="gptq",
+                     max_model_len=MAX_MODEL_LEN,
+                     tensor_parallel_size=1) as gptq_model:
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
+            example_prompts[:-1], max_tokens, num_logprobs)
     check_logprobs_close(
         outputs_0_lst=gptq_outputs,

View File

@@ -61,20 +61,16 @@ def test_models(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
-    marlin_24_model = vllm_runner(model_pair.model_marlin,
-                                  dtype=dtype,
-                                  quantization="gptq_marlin_24")
-    marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
-        example_prompts, max_tokens, num_logprobs)
-    del marlin_24_model
-    gptq_model = vllm_runner(model_pair.model_gptq,
-                             dtype=dtype,
-                             quantization="gptq")
-    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
-                                                       max_tokens,
-                                                       num_logprobs)
-    del gptq_model
+    with vllm_runner(model_pair.model_marlin,
+                     dtype=dtype,
+                     quantization="gptq_marlin_24") as marlin_24_model:
+        marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+    with vllm_runner(model_pair.model_gptq, dtype=dtype,
+                     quantization="gptq") as gptq_model:
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
     check_logprobs_close(
         outputs_0_lst=gptq_outputs,

View File

@@ -94,14 +94,13 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
         for p in HF_IMAGE_PROMPTS
     ]
-    vllm_model = vllm_runner(model_id,
-                             dtype=dtype,
-                             enforce_eager=True,
-                             **vlm_config.as_cli_args_dict())
-    vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
-                                              max_tokens,
-                                              images=vllm_images)
-    del vllm_model
+    with vllm_runner(model_id,
+                     dtype=dtype,
+                     enforce_eager=True,
+                     **vlm_config.as_cli_args_dict()) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
+                                                  max_tokens,
+                                                  images=vllm_images)
     for i in range(len(HF_IMAGE_PROMPTS)):
         hf_output_ids, hf_output_str = hf_outputs[i]

View File

@@ -59,20 +59,16 @@ def test_models(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
-    marlin_model = vllm_runner(model_pair.model_marlin,
-                               dtype=dtype,
-                               quantization="marlin")
-    marlin_outputs = marlin_model.generate_greedy_logprobs(
-        example_prompts, max_tokens, num_logprobs)
-    del marlin_model
-    gptq_model = vllm_runner(model_pair.model_gptq,
-                             dtype=dtype,
-                             quantization="gptq")
-    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
-                                                       max_tokens,
-                                                       num_logprobs)
-    del gptq_model
+    with vllm_runner(model_pair.model_marlin,
+                     dtype=dtype,
+                     quantization="marlin") as marlin_model:
+        marlin_outputs = marlin_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+    with vllm_runner(model_pair.model_gptq, dtype=dtype,
+                     quantization="gptq") as gptq_model:
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
     check_logprobs_close(
         outputs_0_lst=gptq_outputs,

View File

@@ -30,11 +30,9 @@ def test_models(
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
             example_prompts, max_tokens, num_logprobs)
-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
-                                                       max_tokens,
-                                                       num_logprobs)
-    del vllm_model
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,

View File

@@ -37,9 +37,8 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
@@ -57,9 +56,8 @@ def test_model_print(
     model: str,
     dtype: str,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
-    # This test is for verifying whether the model's extra_repr
-    # can be printed correctly.
-    print(vllm_model.model.llm_engine.model_executor.driver_worker.
-          model_runner.model)
-    del vllm_model
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        # This test is for verifying whether the model's extra_repr
+        # can be printed correctly.
+        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+              model_runner.model)

View File

@@ -16,65 +16,65 @@ capability = capability[0] * 10 + capability[1]
     capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
     reason='bitsandbytes is not supported on this GPU type.')
 def test_load_bnb_model(vllm_runner) -> None:
-    llm = vllm_runner('huggyllama/llama-7b',
-                      quantization='bitsandbytes',
-                      load_format='bitsandbytes',
-                      enforce_eager=True)
-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
-    # check the weights in MLP & SelfAttention are quantized to torch.uint8
-    qweight = model.model.layers[0].mlp.gate_up_proj.qweight
-    assert qweight.dtype == torch.uint8, (
-        f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}')
-    qweight = model.model.layers[0].mlp.down_proj.qweight
-    assert qweight.dtype == torch.uint8, (
-        f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}')
-    qweight = model.model.layers[0].self_attn.o_proj.qweight
-    assert qweight.dtype == torch.uint8, (
-        f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}')
-    qweight = model.model.layers[0].self_attn.qkv_proj.qweight
-    assert qweight.dtype == torch.uint8, (
-        f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}')
-    # some weights should not be quantized
-    weight = model.lm_head.weight
-    assert weight.dtype != torch.uint8, (
-        'lm_head weight dtype should not be torch.uint8')
-    weight = model.model.embed_tokens.weight
-    assert weight.dtype != torch.uint8, (
-        'embed_tokens weight dtype should not be torch.uint8')
-    weight = model.model.layers[0].input_layernorm.weight
-    assert weight.dtype != torch.uint8, (
-        'input_layernorm weight dtype should not be torch.uint8')
-    weight = model.model.layers[0].post_attention_layernorm.weight
-    assert weight.dtype != torch.uint8, (
-        'input_layernorm weight dtype should not be torch.uint8')
-    # check the output of the model is expected
-    sampling_params = SamplingParams(temperature=0.0,
-                                     logprobs=1,
-                                     prompt_logprobs=1,
-                                     max_tokens=8)
-    prompts = ['That which does not kill us', 'To be or not to be,']
-    expected_outputs = [
-        'That which does not kill us makes us stronger.',
-        'To be or not to be, that is the question.'
-    ]
-    outputs = llm.generate(prompts, sampling_params=sampling_params)
-    assert len(outputs) == len(prompts)
-    for index in range(len(outputs)):
-        # compare the first line of the output
-        actual_output = outputs[index][1][0].split('\n', 1)[0]
-        expected_output = expected_outputs[index].split('\n', 1)[0]
-        assert actual_output == expected_output, (
-            f'Expected: {expected_output}, but got: {actual_output}')
+    with vllm_runner('huggyllama/llama-7b',
+                     quantization='bitsandbytes',
+                     load_format='bitsandbytes',
+                     enforce_eager=True) as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        # check the weights in MLP & SelfAttention are quantized to torch.uint8
+        qweight = model.model.layers[0].mlp.gate_up_proj.qweight
+        assert qweight.dtype == torch.uint8, (
+            f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}')
+        qweight = model.model.layers[0].mlp.down_proj.qweight
+        assert qweight.dtype == torch.uint8, (
+            f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}')
+        qweight = model.model.layers[0].self_attn.o_proj.qweight
+        assert qweight.dtype == torch.uint8, (
+            f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}')
+        qweight = model.model.layers[0].self_attn.qkv_proj.qweight
+        assert qweight.dtype == torch.uint8, (
+            f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}')
+        # some weights should not be quantized
+        weight = model.lm_head.weight
+        assert weight.dtype != torch.uint8, (
+            'lm_head weight dtype should not be torch.uint8')
+        weight = model.model.embed_tokens.weight
+        assert weight.dtype != torch.uint8, (
+            'embed_tokens weight dtype should not be torch.uint8')
+        weight = model.model.layers[0].input_layernorm.weight
+        assert weight.dtype != torch.uint8, (
+            'input_layernorm weight dtype should not be torch.uint8')
+        weight = model.model.layers[0].post_attention_layernorm.weight
+        assert weight.dtype != torch.uint8, (
+            'input_layernorm weight dtype should not be torch.uint8')
+        # check the output of the model is expected
+        sampling_params = SamplingParams(temperature=0.0,
+                                         logprobs=1,
+                                         prompt_logprobs=1,
+                                         max_tokens=8)
+        prompts = ['That which does not kill us', 'To be or not to be,']
+        expected_outputs = [
+            'That which does not kill us makes us stronger.',
+            'To be or not to be, that is the question.'
+        ]
+        outputs = llm.generate(prompts, sampling_params=sampling_params)
+        assert len(outputs) == len(prompts)
+        for index in range(len(outputs)):
+            # compare the first line of the output
+            actual_output = outputs[index][1][0].split('\n', 1)[0]
+            expected_output = expected_outputs[index].split('\n', 1)[0]
+            assert actual_output == expected_output, (
+                f'Expected: {expected_output}, but got: {actual_output}')

View File

@@ -12,42 +12,45 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
 def test_compressed_tensors_w8a8_static_setup(vllm_runner):
     model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed"
-    llm = vllm_runner(model_path, quantization="sparseml", enforce_eager=True)
-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
-    layer = model.model.layers[0]
-    qkv_proj = layer.self_attn.qkv_proj
-    o_proj = layer.self_attn.o_proj
-    gate_up_proj = layer.mlp.gate_up_proj
-    down_proj = layer.mlp.down_proj
-    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
-    assert qkv_proj.weight.dtype is torch.int8
-    assert o_proj.weight.dtype is torch.int8
-    assert gate_up_proj.weight.dtype is torch.int8
-    assert qkv_proj.weight_scale.shard_splitter is not None
-    assert qkv_proj.weight_scale.logical_widths is not None
-    assert qkv_proj.input_scale.dtype is torch.float32
+    with vllm_runner(model_path, quantization="sparseml",
+                     enforce_eager=True) as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        layer = model.model.layers[0]
+        qkv_proj = layer.self_attn.qkv_proj
+        o_proj = layer.self_attn.o_proj
+        gate_up_proj = layer.mlp.gate_up_proj
+        down_proj = layer.mlp.down_proj
+        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(gate_up_proj.quant_method,
+                          CompressedTensorsLinearMethod)
+        assert isinstance(down_proj.quant_method,
+                          CompressedTensorsLinearMethod)
+        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
+        assert qkv_proj.weight.dtype is torch.int8
+        assert o_proj.weight.dtype is torch.int8
+        assert gate_up_proj.weight.dtype is torch.int8
+        assert qkv_proj.weight_scale.shard_splitter is not None
+        assert qkv_proj.weight_scale.logical_widths is not None
+        assert qkv_proj.input_scale.dtype is torch.float32
 def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
     model_path = "nm-testing/tinyllama-one-shot-dynamic-test"
-    llm = vllm_runner(model_path,
-                      quantization="sparseml",
-                      enforce_eager=True,
-                      dtype=torch.float16)
-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
-    layer = model.model.layers[0]
-    qkv_proj = layer.self_attn.qkv_proj
-    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
-    assert qkv_proj.weight.dtype is torch.int8
+    with vllm_runner(model_path,
+                     quantization="sparseml",
+                     enforce_eager=True,
+                     dtype=torch.float16) as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        layer = model.model.layers[0]
+        qkv_proj = layer.self_attn.qkv_proj
+        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
+        assert qkv_proj.weight.dtype is torch.int8

View File

@@ -16,9 +16,9 @@ capability = capability[0] * 10 + capability[1]
     capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
     reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
-    llm = vllm_runner("facebook/opt-125m", quantization="fp8")
-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
-    fc1 = model.model.decoder.layers[0].fc1
-    assert isinstance(fc1.quant_method, Fp8LinearMethod)
-    assert fc1.weight.dtype == torch.float8_e4m3fn
+    with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        fc1 = model.model.decoder.layers[0].fc1
+        assert isinstance(fc1.quant_method, Fp8LinearMethod)
+        assert fc1.weight.dtype == torch.float8_e4m3fn

View File

@@ -2,10 +2,8 @@
 Run `pytest tests/samplers/test_beam_search.py`.
 """
-import gc
 import pytest
-import torch
 # FIXME(zhuohan): The test can not pass if we:
 # 1. Increase max_tokens to 256.
@@ -34,14 +32,9 @@ def test_beam_search_single_input(
         hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
                                                    max_tokens)
-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
-                                                   max_tokens)
-    del vllm_model
-    # NOTE(woosuk): For some reason, the following GC is required to avoid
-    # GPU OOM errors in the following tests using `vllm_runner`.
-    gc.collect()
-    torch.cuda.empty_cache()
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
+                                                       beam_width, max_tokens)
     for i in range(len(example_prompts)):
         hf_output_ids, _ = hf_outputs[i]

View File

@@ -22,11 +22,12 @@ def test_ignore_eos(
     dtype: str,
     max_tokens: int,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
-    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
-    for prompt in example_prompts:
-        ignore_eos_output = vllm_model.model.generate(
-            prompt, sampling_params=sampling_params)
-        output_length = len(ignore_eos_output[0].outputs[0].token_ids)
-        assert output_length == max_tokens
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        sampling_params = SamplingParams(max_tokens=max_tokens,
+                                         ignore_eos=True)
+        for prompt in example_prompts:
+            ignore_eos_output = vllm_model.model.generate(
+                prompt, sampling_params=sampling_params)
+            output_length = len(ignore_eos_output[0].outputs[0].token_ids)
+            assert output_length == max_tokens

View File

@@ -14,46 +14,46 @@ def test_logits_processor_force_generate(
     model: str,
     dtype: str,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
-    tokenizer = vllm_model.model.get_tokenizer()
-    repeat_times = 2
-    enforced_answers = " vLLM"
-    vllm_token_ids = tokenizer.encode(enforced_answers,
-                                      add_special_tokens=False)
-    max_tokens = len(vllm_token_ids) * repeat_times
-    def pick_vllm(token_ids, logits):
-        token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)]
-        logits[token_id] = torch.finfo(logits.dtype).max
-        return logits
-    params_with_logprobs = SamplingParams(
-        logits_processors=[pick_vllm],
-        prompt_logprobs=3,
-        max_tokens=max_tokens,
-    )
-    # test logits_processors when prompt_logprobs is not None
-    vllm_model.model._add_request(
-        example_prompts[0],
-        params=params_with_logprobs,
-    )
-    # test prompt_logprobs is not None
-    vllm_model.model._add_request(
-        example_prompts[1],
-        params=SamplingParams(
-            prompt_logprobs=3,
-            max_tokens=max_tokens,
-        ),
-    )
-    # test grouped requests
-    vllm_model.model._add_request(
-        example_prompts[2],
-        params=SamplingParams(max_tokens=max_tokens),
-    )
-    outputs = vllm_model.model._run_engine(use_tqdm=False)
-    assert outputs[0].outputs[0].text == enforced_answers * repeat_times
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        tokenizer = vllm_model.model.get_tokenizer()
+        repeat_times = 2
+        enforced_answers = " vLLM"
+        vllm_token_ids = tokenizer.encode(enforced_answers,
+                                          add_special_tokens=False)
+        max_tokens = len(vllm_token_ids) * repeat_times
+        def pick_vllm(token_ids, logits):
+            token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)]
+            logits[token_id] = torch.finfo(logits.dtype).max
+            return logits
+        params_with_logprobs = SamplingParams(
+            logits_processors=[pick_vllm],
+            prompt_logprobs=3,
+            max_tokens=max_tokens,
+        )
+        # test logits_processors when prompt_logprobs is not None
+        vllm_model.model._add_request(
+            example_prompts[0],
+            params=params_with_logprobs,
+        )
+        # test prompt_logprobs is not None
+        vllm_model.model._add_request(
+            example_prompts[1],
+            params=SamplingParams(
+                prompt_logprobs=3,
+                max_tokens=max_tokens,
+            ),
+        )
+        # test grouped requests
+        vllm_model.model._add_request(
+            example_prompts[2],
+            params=SamplingParams(max_tokens=max_tokens),
+        )
+        outputs = vllm_model.model._run_engine(use_tqdm=False)
+        assert outputs[0].outputs[0].text == enforced_answers * repeat_times

View File

@@ -38,21 +38,21 @@ def test_get_prompt_logprobs(
         max_tokens=max_tokens,
     )
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        max_logprobs=num_top_logprobs,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_batched_tokens=max_num_batched_tokens,
-        max_num_seqs=max_num_seqs,
-    )
-    vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
-                                          logprobs=num_top_logprobs,
-                                          prompt_logprobs=num_top_logprobs,
-                                          temperature=0.0,
-                                          detokenize=detokenize)
-    vllm_results = vllm_model.model.generate(
-        example_prompts, sampling_params=vllm_sampling_params)
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_logprobs=num_top_logprobs,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_batched_tokens=max_num_batched_tokens,
+            max_num_seqs=max_num_seqs,
+    ) as vllm_model:
+        vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
+                                              logprobs=num_top_logprobs,
+                                              prompt_logprobs=num_top_logprobs,
+                                              temperature=0.0,
+                                              detokenize=detokenize)
+        vllm_results = vllm_model.model.generate(
+            example_prompts, sampling_params=vllm_sampling_params)
     # Test whether logprobs are included in the results.
     for result in vllm_results:

View File

@@ -17,16 +17,27 @@ def test_ranks(
     num_top_logprobs = 5
     num_prompt_logprobs = 5
-    vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs)
-    ## Test greedy logprobs ranks
-    vllm_sampling_params = SamplingParams(temperature=0.0,
-                                          top_p=1.0,
-                                          max_tokens=max_tokens,
-                                          logprobs=num_top_logprobs,
-                                          prompt_logprobs=num_prompt_logprobs)
-    vllm_results = vllm_model.generate_w_logprobs(example_prompts,
-                                                  vllm_sampling_params)
+    with vllm_runner(model, dtype=dtype,
+                     max_logprobs=num_top_logprobs) as vllm_model:
+        ## Test greedy logprobs ranks
+        vllm_sampling_params = SamplingParams(
+            temperature=0.0,
+            top_p=1.0,
+            max_tokens=max_tokens,
+            logprobs=num_top_logprobs,
+            prompt_logprobs=num_prompt_logprobs)
+        vllm_results = vllm_model.generate_w_logprobs(example_prompts,
+                                                      vllm_sampling_params)
+        ## Test non-greedy logprobs ranks
+        sampling_params = SamplingParams(temperature=1.0,
+                                         top_p=1.0,
+                                         max_tokens=max_tokens,
+                                         logprobs=num_top_logprobs,
+                                         prompt_logprobs=num_prompt_logprobs)
+        res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
     for result in vllm_results:
         assert result[2] is not None
         assert len(result[2]) == len(result[0])
@@ -35,13 +46,6 @@ def test_ranks(
             assert token in logprobs
             assert logprobs[token].rank == 1
-    ## Test non-greedy logprobs ranks
-    sampling_params = SamplingParams(temperature=1.0,
-                                     top_p=1.0,
-                                     max_tokens=max_tokens,
-                                     logprobs=num_top_logprobs,
-                                     prompt_logprobs=num_prompt_logprobs)
-    res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
     for result in res:
         assert result[2] is not None
         assert len(result[2]) == len(result[0])

View File

@@ -17,9 +17,8 @@ RANDOM_SEEDS = list(range(5))
 @pytest.fixture
 def vllm_model(vllm_runner):
-    vllm_model = vllm_runner(MODEL, dtype="half")
-    yield vllm_model
-    del vllm_model
+    with vllm_runner(MODEL, dtype="half") as vllm_model:
+        yield vllm_model
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)

View File

@@ -1,4 +1,3 @@
-import gc
 import json
 import os
 import subprocess
@@ -7,7 +6,6 @@ from unittest.mock import MagicMock, patch
 import openai
 import pytest
 import ray
-import torch
 from vllm import SamplingParams
 # yapf: disable
@@ -71,47 +69,43 @@ def test_can_deserialize_s3(vllm_runner):
     model_ref = "EleutherAI/pythia-1.4b"
     tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
-    loaded_hf_model = vllm_runner(model_ref,
-                                  load_format="tensorizer",
-                                  model_loader_extra_config=TensorizerConfig(
-                                      tensorizer_uri=tensorized_path,
-                                      num_readers=1,
-                                      s3_endpoint="object.ord1.coreweave.com",
-                                  ))
-    deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params)
-    assert deserialized_outputs
+    with vllm_runner(model_ref,
+                     load_format="tensorizer",
+                     model_loader_extra_config=TensorizerConfig(
+                         tensorizer_uri=tensorized_path,
+                         num_readers=1,
+                         s3_endpoint="object.ord1.coreweave.com",
+                     )) as loaded_hf_model:
+        deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params)  # noqa: E501
+        assert deserialized_outputs
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_deserialized_encrypted_vllm_model_has_same_outputs(
         vllm_runner, tmp_path):
-    vllm_model = vllm_runner(model_ref)
-    model_path = tmp_path / (model_ref + ".tensors")
-    key_path = tmp_path / (model_ref + ".key")
-    outputs = vllm_model.generate(prompts, sampling_params)
-    config_for_serializing = TensorizerConfig(tensorizer_uri=model_path)
-    serialize_vllm_model(vllm_model.model.llm_engine,
-                         config_for_serializing,
-                         encryption_key_path=key_path)
-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
+    with vllm_runner(model_ref) as vllm_model:
+        model_path = tmp_path / (model_ref + ".tensors")
+        key_path = tmp_path / (model_ref + ".key")
+        outputs = vllm_model.generate(prompts, sampling_params)
+        config_for_serializing = TensorizerConfig(tensorizer_uri=model_path)
+        serialize_vllm_model(vllm_model.model.llm_engine,
+                             config_for_serializing,
+                             encryption_key_path=key_path)
     config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                 encryption_keyfile=key_path)
-    loaded_vllm_model = vllm_runner(
-        model_ref,
-        load_format="tensorizer",
-        model_loader_extra_config=config_for_deserializing)
-    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
-    assert outputs == deserialized_outputs
+    with vllm_runner(
+            model_ref,
+            load_format="tensorizer",
+            model_loader_extra_config=config_for_deserializing) as loaded_vllm_model:  # noqa: E501
+        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)  # noqa: E501
+        assert outputs == deserialized_outputs
 def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
@@ -124,17 +118,17 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
         serializer = TensorSerializer(stream)
         serializer.write_module(hf_model.model)
-    loaded_hf_model = vllm_runner(model_ref,
-                                  load_format="tensorizer",
-                                  model_loader_extra_config=TensorizerConfig(
-                                      tensorizer_uri=model_path,
-                                      num_readers=1,
-                                  ))
-    deserialized_outputs = loaded_hf_model.generate_greedy(
-        prompts, max_tokens=max_tokens)
-    assert outputs == deserialized_outputs
+    with vllm_runner(model_ref,
+                     load_format="tensorizer",
+                     model_loader_extra_config=TensorizerConfig(
+                         tensorizer_uri=model_path,
+                         num_readers=1,
+                     )) as loaded_hf_model:
+        deserialized_outputs = loaded_hf_model.generate_greedy(
+            prompts, max_tokens=max_tokens)
+        assert outputs == deserialized_outputs
 def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
@@ -148,16 +142,13 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
     test_prompts = create_test_prompts(lora_path)
     # Serialize model before deserializing and binding LoRA adapters
-    vllm_model = vllm_runner(model_ref, )
-    model_path = tmp_path / (model_ref + ".tensors")
-    serialize_vllm_model(vllm_model.model.llm_engine,
-                         TensorizerConfig(tensorizer_uri=model_path))
-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
-    loaded_vllm_model = vllm_runner(
+    with vllm_runner(model_ref, ) as vllm_model:
+        model_path = tmp_path / (model_ref + ".tensors")
+        serialize_vllm_model(vllm_model.model.llm_engine,
+                             TensorizerConfig(tensorizer_uri=model_path))
+    with vllm_runner(
         model_ref,
         load_format="tensorizer",
         model_loader_extra_config=TensorizerConfig(
@@ -170,10 +161,10 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
         max_cpu_loras=2,
         max_num_seqs=50,
         max_model_len=1000,
-    )
-    process_requests(loaded_vllm_model.model.llm_engine, test_prompts)
-    assert loaded_vllm_model
+    ) as loaded_vllm_model:
+        process_requests(loaded_vllm_model.model.llm_engine, test_prompts)
+        assert loaded_vllm_model
 def test_load_without_tensorizer_load_format(vllm_runner):
@@ -186,19 +177,15 @@ def test_load_without_tensorizer_load_format(vllm_runner):
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
     ## Serialize model
-    vllm_model = vllm_runner(model_ref, )
-    model_path = tmp_path / (model_ref + ".tensors")
-    serialize_vllm_model(vllm_model.model.llm_engine,
-                         TensorizerConfig(tensorizer_uri=model_path))
-    model_loader_extra_config = {
-        "tensorizer_uri": str(model_path),
-    }
-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
+    with vllm_runner(model_ref, ) as vllm_model:
+        model_path = tmp_path / (model_ref + ".tensors")
+        serialize_vllm_model(vllm_model.model.llm_engine,
+                             TensorizerConfig(tensorizer_uri=model_path))
+        model_loader_extra_config = {
+            "tensorizer_uri": str(model_path),
+        }
     ## Start OpenAI API server
     openai_args = [
@@ -260,18 +247,15 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
     model_path = tmp_path / (model_ref + ".tensors")
     config = TensorizerConfig(tensorizer_uri=str(model_path))
-    vllm_model = vllm_runner(model_ref)
-    outputs = vllm_model.generate(prompts, sampling_params)
-    serialize_vllm_model(vllm_model.model.llm_engine, config)
-    assert is_vllm_tensorized(config)
-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
-    loaded_vllm_model = vllm_runner(model_ref,
-                                    load_format="tensorizer",
-                                    model_loader_extra_config=config)
-    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
-    assert outputs == deserialized_outputs
+    with vllm_runner(model_ref) as vllm_model:
+        outputs = vllm_model.generate(prompts, sampling_params)
+        serialize_vllm_model(vllm_model.model.llm_engine, config)
+        assert is_vllm_tensorized(config)
+    with vllm_runner(model_ref,
+                     load_format="tensorizer",
+                     model_loader_extra_config=config) as loaded_vllm_model:
+        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)  # noqa: E501
+        assert outputs == deserialized_outputs