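"""Tests for the logprobs and prompt_logprobs returned by vLLM.

The values are checked for internal consistency and compared against
logprobs computed directly with HuggingFace Transformers.
"""
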
import pytest
import torch

from vllm import SamplingParams

from ..conftest import VllmRunner

MODELS = ["facebook/opt-125m"]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
@pytest.mark.parametrize("num_top_logprobs", [6])  # 32000 == vocab_size
def test_get_prompt_logprobs(
    hf_runner,
    vllm_runner,
    model,
    dtype,
    chunked_prefill_token_size: int,
    num_top_logprobs: int,
    example_prompts,
):
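    """Check vLLM's logprobs and prompt_logprobs for internal consistency
    and compare them against logprobs computed with HuggingFace."""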
    max_num_seqs = 256
    enable_chunked_prefill = False
    max_num_batched_tokens = None
    # A chunked_prefill_token_size of -1 means chunked prefill is disabled.
    if chunked_prefill_token_size != -1:
        enable_chunked_prefill = True
        max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
        max_num_batched_tokens = chunked_prefill_token_size

    # Generate reference logprobs with HuggingFace, then free the HF model.
    max_tokens = 5
    hf_model = hf_runner(model, dtype=dtype)
    hf_logprobs = hf_model.generate_greedy_logprobs(
        example_prompts,
        max_tokens=max_tokens,
    )
    del hf_model

    vllm_model = vllm_runner(
        model,
        dtype=dtype,
        max_logprobs=num_top_logprobs,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
        max_num_seqs=max_num_seqs,
    )
    vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
                                          logprobs=num_top_logprobs,
                                          prompt_logprobs=num_top_logprobs,
                                          temperature=0.0)
    vllm_results = vllm_model.model.generate(
        example_prompts, sampling_params=vllm_sampling_params)

    # Test whether logprobs are included in the results.
    for result in vllm_results:
        assert result.prompt_logprobs is not None
        assert result.outputs[0].logprobs is not None
        assert len(result.outputs[0].logprobs) == max_tokens
        for logprobs in result.outputs[0].logprobs:
            assert len(logprobs) == num_top_logprobs
        output_text = result.outputs[0].text
        output_string_from_most_likely_tokens = []
        for top_logprobs in result.outputs[0].logprobs:
            top_logprob = next(iter(top_logprobs.values()))
            output_string_from_most_likely_tokens.append(
                top_logprob.decoded_token)
        output_string_from_most_likely_tokens = "".join(
            output_string_from_most_likely_tokens)
        assert output_text == output_string_from_most_likely_tokens, (
            "The output text from the top logprob for each token position "
            "should be the same as the output text in the result.")

        # The first prompt logprob is always None.
        assert result.prompt_logprobs[0] is None
        for prompt_logprobs in result.prompt_logprobs[1:]:
            # If the prompt token is not included in the top X
            # logprobs, one extra entry can be returned.
            assert (len(prompt_logprobs) == num_top_logprobs
                    or len(prompt_logprobs) == num_top_logprobs + 1)

    # Test whether prompt logprobs are consistent with HF.
    for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs):
        # Check prompt logprobs.
        # The first prompt logprob is always None, so we compare from 1:.
        vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
        for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
            for token_id, logprob in vllm_prompt_logprob_dict.items():
                torch.testing.assert_close(logprob.logprob,
                                           hf_logprob[0][i][token_id].item(),
                                           atol=1e-2,
                                           rtol=1e-2)
        # Check sample logprobs.
        vllm_sample_logprobs = vllm_result.outputs[0].logprobs
        for i, top_logprobs in enumerate(vllm_sample_logprobs):
            for token_id, sample_logprob in top_logprobs.items():
                logprob = sample_logprob.logprob
                torch.testing.assert_close(logprob,
                                           hf_logprob[i][-1][token_id].item(),
                                           atol=1e-2,
                                           rtol=1e-2)
                assert isinstance(sample_logprob.decoded_token, str), (
                    "The token should be decoded by the time it is returned "
                    "to the user.")

    # Test whether prompt logprobs are correctly set.
    for vllm_result in vllm_results:
        token_ids = vllm_result.prompt_token_ids
        prompt_logprobs = vllm_result.prompt_logprobs

        # The first token doesn't have a logprob.
        assert prompt_logprobs[0] is None

        for token_id, logprob_dict in zip(token_ids[1:], prompt_logprobs[1:]):
            assert token_id in logprob_dict


def test_max_logprobs():
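    """Asking for more logprobs than max_logprobs should raise an error."""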
    runner = VllmRunner("facebook/opt-125m", max_logprobs=1)
    vllm_sampling_params = SamplingParams(logprobs=1)
    # logprobs=1 does not exceed max_logprobs, so this should pass.
    runner.generate(["Hello world"], sampling_params=vllm_sampling_params)

    bad_sampling_params = SamplingParams(logprobs=2)
    with pytest.raises(ValueError):
        runner.generate(["Hello world"], sampling_params=bad_sampling_params)