# SPDX-License-Identifier: Apache-2.0
"""Tests for typical acceptance sampling."""

import pytest
import torch

from vllm.model_executor.layers.typical_acceptance_sampler import (
    TypicalAcceptanceSampler)
from vllm.model_executor.utils import set_random_seed

CUDA_DEVICES = [f"cuda:{i}" for i in range(1)]
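
# Typical acceptance (following the scheme from the Medusa work) accepts a
# draft token x when its probability under the target model is large
# relative to the entropy of the target distribution, roughly:
#     p_target(x) > min(posterior_threshold,
#                       posterior_alpha * exp(-H(p_target)))
# The exact criterion lives in TypicalAcceptanceSampler; the tests below
# exercise it at its extremes (uniform vs. temperature-zero targets).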


@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """
    This file tests V0 internals, so set VLLM_USE_V1=0.
    """
    monkeypatch.setenv('VLLM_USE_V1', '0')


def get_zero_temperature_prob_dist(batch_size, k, vocab_size):
    """
    Generates a fake temperature zero probability distribution.
    Returns:
        1. A fake temperature zero probability distribution of shape
           [batch_size, k, vocab_size]
        2. Tensor of shape [batch_size, k] containing the token ids
           of the probability 1.0 tokens at each position.
    """
    # Simulate a temperature 0 distribution for the target probabilities:
    # pick one token id per position and give it probability 1.0, with all
    # other tokens getting probability 0.
    probs = torch.rand(batch_size, k, vocab_size)
    _, zero_temperature_token_ids = torch.max(probs, dim=-1)
    target_probs = torch.zeros_like(probs).scatter_(
        -1, zero_temperature_token_ids.unsqueeze(-1), 1.0)
    return target_probs, zero_temperature_token_ids


def get_draft_token_ids(batch_size: int, k: int, vocab_size: int,
                        token_ids_to_exclude: torch.Tensor):
    """
    Returns a tensor of shape [batch_size, k] of fake draft token ids
    drawn randomly from a vocab of size vocab_size, while ensuring that
    token ids from token_ids_to_exclude are excluded at the corresponding
    positions.
    """
    draft_token_ids = torch.empty(batch_size, k, dtype=torch.long)
    for i in range(batch_size):
        for j in range(k):
            # Generate a random token id, excluding
            # token_ids_to_exclude[i, j].
            while True:
                token_id = torch.randint(0, vocab_size, (1, )).item()
                if token_id != token_ids_to_exclude[i, j]:
                    draft_token_ids[i, j] = token_id
                    break
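    # Note: the nested rejection loop is fine at test scale. A vectorized
    # sketch (an alternative, not what this file uses) would be:
    #     ids = torch.randint(0, vocab_size - 1, (batch_size, k))
    #     ids += (ids >= token_ids_to_exclude).long()
    # i.e. sample from vocab_size - 1 ids and shift past the excluded one.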
    return draft_token_ids


def get_acceptance_sampler(
    posterior_threshold: float = 0.03,
    posterior_alpha: float = 0.9,
    strict_mode: bool = False,
) -> TypicalAcceptanceSampler:
    """
    Initializes and returns a TypicalAcceptanceSampler.
    """
    return TypicalAcceptanceSampler(posterior_threshold, posterior_alpha,
                                    strict_mode)


@pytest.mark.parametrize("k", list(range(1, 6)))
@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
@pytest.mark.parametrize("batch_size", list(range(1, 32)))
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
                                    device: str):
    """
    Tests that the TypicalAcceptanceSampler forward pass succeeds for
    different combinations of k, vocab_size, batch_size and device.
    """
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler()
    typical_acceptance_sampler.init_gpu_tensors(device=device)
    target_with_bonus_probs = torch.rand(batch_size,
                                         k + 1,
                                         vocab_size,
                                         dtype=torch.float32)
    bonus_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, 1),
                                    dtype=torch.int64)
    draft_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, k),
                                    dtype=torch.int64)
    # Verify that sampling succeeds for all cases.
    typical_acceptance_sampler(target_with_bonus_probs,
                               bonus_token_ids,
                               draft_probs=None,
                               draft_token_ids=draft_token_ids)
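

# strict_mode=True makes the sampler run extra validation on its inputs
# (e.g. token id ranges); the out-of-vocabulary test below relies on that
# validation raising an AssertionError.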
@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"])
@pytest.mark.parametrize("which_token_ids",
                         ["bonus_token_ids", "draft_token_ids"])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
                               which_token_ids: str, device: str):
    """
    Tests that an exception is thrown if the token ids fall outside the
    bounds of the provided vocabulary.
    """
    k = 3
    batch_size = 5
    vocab_size = 30_000
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
    typical_acceptance_sampler.init_gpu_tensors(device=device)
    target_with_bonus_probs = torch.rand(batch_size,
                                         k + 1,
                                         vocab_size,
                                         dtype=torch.float32)
    bonus_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, 1),
                                    dtype=torch.int64)
    draft_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, k),
                                    dtype=torch.int64)
    # Verify that appropriate exceptions are thrown for out
    # of bound vocabs.
    oob_token_ids = None
    if which_token_ids == "bonus_token_ids":
        oob_token_ids = bonus_token_ids
    elif which_token_ids == "draft_token_ids":
        oob_token_ids = draft_token_ids
    else:
        raise AssertionError()

    if above_or_below_vocab_range == "above":
        rogue_token_id = vocab_size + 1
    elif above_or_below_vocab_range == "below":
        rogue_token_id = -1
    else:
        raise AssertionError()

    oob_token_ids[0][0] = rogue_token_id

    with pytest.raises(AssertionError):
        typical_acceptance_sampler(target_with_bonus_probs,
                                   bonus_token_ids,
                                   draft_probs=None,
                                   draft_token_ids=draft_token_ids)


@pytest.mark.parametrize("seed", list(range(10)))
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_uniform_target_distribution_accepts_all_tokens(
        seed: int, device: str):
    """
    Test the TypicalAcceptanceSampler with a uniform target probability
    distribution.

    This test verifies that when provided with a uniform target probability
    distribution, the TypicalAcceptanceSampler accepts all draft tokens. The
    high entropy of the uniform target distribution should lead to all draft
    tokens being accepted.
    """
    set_random_seed(seed)
    k = 3
    batch_size = 5
    vocab_size = 30_000
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
    typical_acceptance_sampler.init_gpu_tensors(device=device)
    target_with_bonus_probs = torch.rand(batch_size,
                                         k + 1,
                                         vocab_size,
                                         dtype=torch.float32)
    draft_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, k),
                                    dtype=torch.int64)
    bonus_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, 1),
                                    dtype=torch.int64)
    output_token_ids = typical_acceptance_sampler(
        target_with_bonus_probs,
        bonus_token_ids,
        draft_probs=None,
        draft_token_ids=draft_token_ids)
    # We are using a uniform target probability distribution.
    # For a uniform distribution the entropy is very high and it
    # should lead to all draft tokens being accepted. Verify that.
    assert output_token_ids.shape[0] == batch_size
    assert output_token_ids.shape[1] == (k + 1)
    assert torch.all(output_token_ids[:, -1] == bonus_token_ids.squeeze())

    assert torch.all(output_token_ids[:, :k] == draft_token_ids)


@pytest.mark.parametrize("seed", list(range(10)))
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_temperature_zero_target_distribution(seed: int, device: str):
    """
    Test the TypicalAcceptanceSampler with a zero-temperature target
    probability distribution.

    This test verifies that when using a zero-temperature target probability
    distribution, where only one token has a probability of 1.0, the
    TypicalAcceptanceSampler correctly rejects all draft tokens that do not
    match this token. Additionally, it ensures that when all draft tokens
    are rejected, the sampler falls back to greedy sampling to select a
    single token from the target distribution.
    """
    set_random_seed(seed)
    k = 3
    batch_size = 5
    vocab_size = 30_000
    torch.set_default_device(device)

    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
    typical_acceptance_sampler.init_gpu_tensors(device=device)
    # Simulate a temperature 0 probability distribution for the target
    # probabilities such that only 1 token id has probability 1.0.
    target_with_bonus_probs, zero_temperature_token_ids = \
        get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size)
    zero_temperature_token_ids = zero_temperature_token_ids[:, :-1]
    # Populate draft_token_ids such that they exclude the token_ids
    # with probability = 1.0
    draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size,
                                          zero_temperature_token_ids)
    bonus_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, 1),
                                    dtype=torch.int64)
    # The target probability distribution is a temperature zero distribution
    # with zero entropy. Since our draft token ids don't match the
    # probability 1.0 tokens in the target distribution, we will reject all
    # of them and fall back to greedy sampling to select 1 token for each
    # sequence. Verify the same.
    output_token_ids = typical_acceptance_sampler(
        target_with_bonus_probs,
        bonus_token_ids,
        draft_probs=None,
        draft_token_ids=draft_token_ids)
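    # In the output, -1 marks positions where no token was emitted: once a
    # draft token is rejected, the recovered (greedy) token takes its slot
    # and every later position, including the bonus slot, is set to -1.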
    assert output_token_ids.shape[0] == batch_size
    assert output_token_ids.shape[1] == (k + 1)
    assert torch.all(output_token_ids[:, -1] == -1)
    assert torch.all(
        output_token_ids[:, 0] == zero_temperature_token_ids[:, 0])


@pytest.mark.parametrize("seed", list(range(10)))
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_mixed_target_distribution(seed: int, device: str):
    """
    Test the TypicalAcceptanceSampler with a mixed target probability
    distribution.

    This test ensures that the TypicalAcceptanceSampler handles a mixed
    target probability distribution correctly. Specifically, it uses a
    zero-temperature distribution for some sequences and a uniform
    distribution for others. The test verifies that:

    - For sequences with a zero-temperature distribution, only the token
      with a probability of 1.0 is accepted, and all other tokens are
      rejected.
    - For sequences with a uniform distribution, all draft tokens are
      accepted.
    """
    set_random_seed(seed)
    k = 3
    batch_size = 4
    vocab_size = 30_000
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
    typical_acceptance_sampler.init_gpu_tensors(device=device)
    # For sequences 0 and 2 set the distribution to a temperature
    # zero distribution. For sequences 1 and 3 set it to a uniform
    # distribution.
    target_with_bonus_probs, zero_temperature_token_ids = \
        get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size)
    zero_temperature_token_ids = zero_temperature_token_ids[:, :-1]
    target_probs = target_with_bonus_probs[:, :-1]
    draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size,
                                          zero_temperature_token_ids)
    uniform_probs = torch.rand(2, k, vocab_size, dtype=torch.float32)
    target_probs[[1, 3]] = uniform_probs
    bonus_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, 1),
                                    dtype=torch.int64)
    output_token_ids = typical_acceptance_sampler(
        target_with_bonus_probs,
        bonus_token_ids,
        draft_probs=None,
        draft_token_ids=draft_token_ids)
    # Verify the shape of output_token_ids.
    assert output_token_ids.shape[0] == batch_size
    assert output_token_ids.shape[1] == (k + 1)
    # For sequences 0 and 2 verify that only 1 token is accepted
    # which is the token with probability 1.0 in the target distribution
    # at position 0.
    assert torch.all(output_token_ids[[0, 2], 1:] == -1)
    assert torch.all(
        output_token_ids[[0, 2], 0] == zero_temperature_token_ids[[0, 2], 0])
    # For sequences 1 and 3 verify that all tokens are accepted since the
    # target probability distribution is uniform. In addition verify that
    # we also accept the bonus tokens.
    assert torch.all(
        output_token_ids[[1, 3], :-1] == draft_token_ids[[1, 3], :])
    assert torch.all(output_token_ids[[1, 3], -1] != -1)


@pytest.mark.parametrize("seed", list(range(10)))
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_accept_tokens_partially(seed: int, device: str):
    """
    Test the TypicalAcceptanceSampler's behavior when only a subset of draft
    tokens should be accepted.

    This test verifies that the TypicalAcceptanceSampler correctly accepts or
    rejects draft tokens based on a zero-temperature target probability
    distribution. Specifically, it ensures that:

    - When all draft tokens match tokens with a probability of 1.0 in the
      target distribution, all draft tokens are accepted.
    - When only some draft tokens match tokens with a probability of 1.0 in
      the target distribution, only those matching tokens are accepted, and
      the rest are rejected.
    """
    set_random_seed(seed)
    k = 5
    batch_size = 1
    vocab_size = 30_000
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
    typical_acceptance_sampler.init_gpu_tensors(device=device)
    # Create a temperature zero target probability distribution and ensure
    # all draft token ids correspond to the tokens with 1.0 probability.
    # Verify that all of them are accepted.
    target_with_bonus_probs, zero_temperature_token_ids = \
        get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size)
    zero_temperature_token_ids = zero_temperature_token_ids[:, :-1]
    draft_token_ids = zero_temperature_token_ids
    bonus_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, 1),
                                    dtype=torch.int64)
    output_token_ids = typical_acceptance_sampler(
        target_with_bonus_probs,
        bonus_token_ids,
        draft_probs=None,
        draft_token_ids=draft_token_ids)
    assert output_token_ids.shape[0] == batch_size
    assert output_token_ids.shape[1] == (k + 1)
    assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids)
    assert torch.all(output_token_ids[:, -1] == bonus_token_ids)
    # Next, keep only the first 2 draft tokens the same as the zero
    # temperature tokens and choose different tokens for the remaining 3.
    # In the response we expect the first 2 tokens to be the same as the
    # draft tokens, the third to be the recovered token, and the rest
    # to be -1.
    draft_token_ids_to_replace = get_draft_token_ids(
        batch_size, k, vocab_size, zero_temperature_token_ids)
    draft_token_ids = torch.cat(
        (draft_token_ids[:, :2], draft_token_ids_to_replace[:, -3:]), dim=1)
    output_token_ids = typical_acceptance_sampler(
        target_with_bonus_probs,
        bonus_token_ids,
        draft_probs=None,
        draft_token_ids=draft_token_ids)
    assert output_token_ids.shape[0] == batch_size
    assert output_token_ids.shape[1] == (k + 1)
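    # Position 2 holds the "recovered" token: when the draft token at a
    # position is rejected, the sampler emits the greedy (argmax) token of
    # the target distribution at that slot and pads the rest with -1.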
    assert torch.all(output_token_ids[:, :2] == draft_token_ids[:, :2])
    assert torch.all(
        output_token_ids[:, 2] == target_with_bonus_probs.argmax(-1)[:, 2])
    assert torch.all(output_token_ids[:, -3:] == -1)


@pytest.mark.parametrize("seed", list(range(1)))
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_accept_tokens_set_non_default_posteriors(seed: int, device: str):
    """
    Test the TypicalAcceptanceSampler with custom posterior thresholds and
    alpha values. This test verifies that by modifying the posterior
    thresholds and alpha values we can change the acceptance behavior of the
    sampler.
    """
    set_random_seed(seed)
    k = 5
    batch_size = 1
    vocab_size = 30_000
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
    typical_acceptance_sampler.init_gpu_tensors(device=device)
    # Simulate a temperature 0 probability distribution for the target
    # probabilities such that only 1 token id has probability 1.0 and the
    # others have a very low probability of 0.00001. Populate
    # draft_token_ids such that they exclude the token ids with
    # probability = 1.0. Without any changes to the posterior thresholds,
    # none of the draft tokens are accepted.
    target_probs, zero_temperature_token_ids = get_zero_temperature_prob_dist(
        batch_size, k + 1, vocab_size)
    zero_temperature_token_ids = zero_temperature_token_ids[:, :-1]
    target_probs[target_probs == 0] = 0.00001
    draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size,
                                          zero_temperature_token_ids)
    bonus_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, 1),
                                    dtype=torch.int64)
    output_token_ids = typical_acceptance_sampler(
        target_probs,
        bonus_token_ids,
        draft_probs=None,
        draft_token_ids=draft_token_ids)
    assert output_token_ids.shape[0] == batch_size
    assert output_token_ids.shape[1] == (k + 1)
    assert torch.all(output_token_ids[:, 1:-1] == -1)

    # Change the posterior threshold values to 0.0 so that we will
    # now accept even draft tokens with very low probability in the
    # target distribution. Simulate and verify the same.
    typical_acceptance_sampler = TypicalAcceptanceSampler(
        strict_mode=True, posterior_threshold=0.0, posterior_alpha=0.0)
    typical_acceptance_sampler.init_gpu_tensors(device=device)
    output_token_ids = typical_acceptance_sampler(
        target_probs,
        bonus_token_ids,
        draft_probs=None,
        draft_token_ids=draft_token_ids)
    assert output_token_ids.shape[0] == batch_size
    assert output_token_ids.shape[1] == (k + 1)
    assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids)
    assert torch.all(output_token_ids[:, -1] == bonus_token_ids)


@pytest.mark.parametrize("seed", list(range(10)))
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_get_recovered_token_ids(seed: int, device: str):
    """
    Test the TypicalAcceptanceSampler's method for generating
    replacement token IDs.

    This test verifies that the `_get_recovered_token_ids` method of the
    TypicalAcceptanceSampler correctly identifies the token IDs to be used
    as recovered token IDs based on the target probability distribution.
    Specifically, it ensures that the method correctly identifies the
    tokens with the highest probability for each sequence in the batch.
    """
    set_random_seed(seed)
    k = 10
    batch_size = 5
    vocab_size = 30_000
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
    typical_acceptance_sampler.init_gpu_tensors(device=device)
    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
    expected_replacement_tokens = torch.argmax(target_probs, dim=-1)
    actual_replacement_tokens = (
        typical_acceptance_sampler._get_recovered_token_ids(target_probs))
    assert torch.all(expected_replacement_tokens == actual_replacement_tokens)