
# SPDX-License-Identifier: Apache-2.0
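"""Tests for the sample and prompt logprobs returned by vLLM.

Compares vLLM logprobs against HuggingFace greedy logprobs, and checks the
max_logprobs limit and the logprobs=None case.
"""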

from typing import List

import pytest
import torch

from vllm import SamplingParams

from ..conftest import VllmRunner

MODELS = ["facebook/opt-125m"]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype",
                         ["float"])  # needed for comparing logprobs with HF
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
@pytest.mark.parametrize("num_top_logprobs", [0, 6])  # 32000 == vocab_size
@pytest.mark.parametrize("detokenize", [True, False])
def test_get_prompt_logprobs(
    hf_runner,
    vllm_runner,
    model,
    dtype,
    chunked_prefill_token_size: int,
    num_top_logprobs: int,
    detokenize: bool,
    example_prompts,
):
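    """Check vLLM sample and prompt logprobs against HuggingFace greedy
    logprobs, with and without chunked prefill and detokenization."""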
    max_num_seqs = 256
    enable_chunked_prefill = False
    max_num_batched_tokens = None
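    # chunked_prefill_token_size == -1 leaves chunked prefill disabled;
    # otherwise it caps both the per-batch token budget and the number of
    # concurrent sequences.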
    if chunked_prefill_token_size != -1:
        enable_chunked_prefill = True
        max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
        max_num_batched_tokens = chunked_prefill_token_size

    max_tokens = 5
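    # Reference logprobs from the HuggingFace implementation (greedy decoding).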
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_logprobs = hf_model.generate_greedy_logprobs(
            example_prompts,
            max_tokens=max_tokens,
        )

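    # Generate with vLLM, requesting top-N logprobs for both sampled and
    # prompt tokens.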
    with vllm_runner(
            model,
            dtype=dtype,
            max_logprobs=num_top_logprobs,
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_batched_tokens=max_num_batched_tokens,
            max_num_seqs=max_num_seqs,
    ) as vllm_model:
        vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
                                              logprobs=num_top_logprobs,
                                              prompt_logprobs=num_top_logprobs,
                                              temperature=0.0,
                                              detokenize=detokenize)
        vllm_results = vllm_model.model.generate(
            example_prompts, sampling_params=vllm_sampling_params)

    # Test whether logprobs are included in the results.
    for result in vllm_results:
        assert result.prompt_logprobs is not None
        assert result.outputs[0].logprobs is not None
        assert len(result.outputs[0].logprobs) == max_tokens
        for logprobs in result.outputs[0].logprobs:
            # If the output token is not among the top num_top_logprobs
            # candidates, one extra entry is returned for it.
            assert (len(logprobs) == num_top_logprobs
                    or len(logprobs) == num_top_logprobs + 1)
        output_text = result.outputs[0].text
        output_string_from_most_likely_tokens_lst: List[str] = []
        for top_logprobs in result.outputs[0].logprobs:
            top_logprob = next(iter(top_logprobs.values()))
            output_string_from_most_likely_tokens_lst.append(
                top_logprob.decoded_token)

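        # With greedy decoding (temperature=0.0), the top-1 token at each
        # position should reconstruct the generated text exactly.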
        if detokenize:
            output_string_from_most_likely_tokens = "".join(
                output_string_from_most_likely_tokens_lst)
            assert output_text == output_string_from_most_likely_tokens, (
                "The output text from the top logprob for each token position "
                "should be the same as the output text in the result.")
        else:
            assert output_text == ''
            assert output_string_from_most_likely_tokens_lst == ([None] *
                                                                  max_tokens)

        # The first prompt logprob is always None.
        assert result.prompt_logprobs[0] is None
        for prompt_logprobs in result.prompt_logprobs[1:]:
            # If the prompt token is not among the top num_top_logprobs
            # candidates, one extra entry is returned for it.
            assert (len(prompt_logprobs) == num_top_logprobs
                    or len(prompt_logprobs) == num_top_logprobs + 1)

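    # In the checks below, hf_logprob[0] is indexed by prompt position, while
    # hf_logprob[i][-1] holds the distribution for the i-th generated token.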
    # Test whether prompt logprobs are consistent with HF.
    for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs):
        # Check prompt logprobs. The first prompt logprob is always None,
        # so we compare from index 1 onwards.
        vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
        for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
            for token_id, logprob in vllm_prompt_logprob_dict.items():
                torch.testing.assert_close(logprob.logprob,
                                           hf_logprob[0][i][token_id].item(),
                                           atol=1e-2,
                                           rtol=1e-2)
        vllm_sample_logprobs = vllm_result.outputs[0].logprobs
        for i, top_logprobs in enumerate(vllm_sample_logprobs):
            for token_id, sample_logprob in top_logprobs.items():
                logprob = sample_logprob.logprob
                torch.testing.assert_close(logprob,
                                           hf_logprob[i][-1][token_id].item(),
                                           atol=1e-2,
                                           rtol=1e-2)
                if detokenize:
                    assert isinstance(sample_logprob.decoded_token, str), (
                        "The token should be decoded by the time it is "
                        "returned to the user.")

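    # Even when num_top_logprobs is 0, each prompt position (after the first)
    # must still report the logprob of its own token.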
    # Test if prompt logprobs are correctly set.
    for vllm_result in vllm_results:
        token_ids = vllm_result.prompt_token_ids
        prompt_logprobs = vllm_result.prompt_logprobs

        # The first token doesn't have a logprob.
        assert prompt_logprobs[0] is None

        for token_id, logprob_dict in zip(token_ids[1:], prompt_logprobs[1:]):
            assert token_id in logprob_dict


def test_max_logprobs():
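    """Requesting more logprobs than max_logprobs should raise an error."""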
    runner = VllmRunner("facebook/opt-125m", max_logprobs=1)
    vllm_sampling_params = SamplingParams(logprobs=1)
    # should pass
    runner.generate(["Hello world"], sampling_params=vllm_sampling_params)

    bad_sampling_params = SamplingParams(logprobs=2)
    with pytest.raises(ValueError):
        runner.generate(["Hello world"], sampling_params=bad_sampling_params)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
@pytest.mark.parametrize("detokenize", [True, False])
def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int,
                       detokenize: bool, example_prompts):
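    """With logprobs=None, outputs carry no logprobs or cumulative logprob."""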
    max_num_seqs = 256
    enable_chunked_prefill = False
    max_num_batched_tokens = None
    if chunked_prefill_token_size != -1:
        enable_chunked_prefill = True
        max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
        max_num_batched_tokens = chunked_prefill_token_size
    max_tokens = 5

    with vllm_runner(
            model,
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_batched_tokens=max_num_batched_tokens,
            max_num_seqs=max_num_seqs,
    ) as vllm_model:
        sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens,
                                                       logprobs=None,
                                                       temperature=0.0,
                                                       detokenize=detokenize)
        results_logprobs_none = vllm_model.model.generate(
            example_prompts, sampling_params=sampling_params_logprobs_none)

    for i in range(len(results_logprobs_none)):
        assert results_logprobs_none[i].outputs[0].logprobs is None
        assert results_logprobs_none[i].outputs[0].cumulative_logprob is None