577 lines
22 KiB
Python
577 lines
22 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
import pytest
|
|
|
|
from vllm.core.block.block_table import BlockTable
|
|
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
|
|
from vllm.utils import Device, cdiv, chunk_list
|
|
|
|
|
|
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_naive(block_size: int, sequence_len: int):
    """Test the allocation of blocks using the naive allocator.

    Creates a CpuGpuBlockAllocator backed by the naive allocator, then
    allocates the same token sequence into several independent BlockTables.
    The GPU free-block count is checked before every allocation and once more
    after the final one, so that every allocation (including the last) is
    verified to consume ``num_blocks_per_alloc`` blocks.
    """
    assert block_size > 1
    num_gpu_blocks = 1024
    num_allocs = 5

    allocator = CpuGpuBlockAllocator.create(
        allocator_type="naive",
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    # One block per block_size-sized chunk of the sequence.
    num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))

    # Keep every table referenced so its blocks stay allocated for the whole
    # test.
    block_tables: list[BlockTable] = []
    for i in range(num_allocs):
        # The i previous allocations must each have consumed
        # num_blocks_per_alloc blocks.
        assert allocator.get_num_free_blocks(
            device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc

        block_tables.append(
            BlockTable(
                block_size=block_size,
                block_allocator=allocator,
            ))
        block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)

    # The in-loop check runs *before* each allocation, so without this final
    # assertion the last allocation would go unverified.
    assert allocator.get_num_free_blocks(device=Device.GPU) == (
        num_gpu_blocks - num_allocs * num_blocks_per_alloc)
|
|
|
|
|
|
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_prefix_caching(block_size: int, sequence_len: int):
    """Check free-block accounting for the prefix caching allocator.

    The same token sequence is allocated into five BlockTables. The test
    expects the full blocks of the sequence to be shared between all tables
    (so they are only paid for once), while a partially filled last block,
    if there is one, costs one extra block per table. After each allocation
    the GPU free-block count is compared against that expectation.
    """
    assert block_size > 1
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type="prefix_caching",
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    chunks = list(chunk_list(token_ids, block_size))

    # Only a last chunk shorter than block_size is counted as a mutable
    # (per-table) block; every other chunk is counted as immutable (shared).
    last_chunk_is_full = len(chunks[-1]) == block_size
    mutable_per_alloc = 0 if last_chunk_is_full else 1
    immutable_per_alloc = len(chunks) - mutable_per_alloc

    block_tables: list[BlockTable] = []
    for num_allocs_done in range(1, 6):
        table = BlockTable(
            block_size=block_size,
            block_allocator=allocator,
        )
        block_tables.append(table)
        table.allocate(token_ids=token_ids, device=Device.GPU)

        # Shared blocks are counted once; the mutable tail block (if any) is
        # counted once per table allocated so far.
        expected_used = (immutable_per_alloc +
                         mutable_per_alloc * num_allocs_done)
        assert allocator.get_num_free_blocks(
            device=Device.GPU) == num_gpu_blocks - expected_used
|
|
|
|
|
|
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
@pytest.mark.parametrize("device", ["cpu", "gpu"])
def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
                       device: str):
    """Allocate and free one BlockTable repeatedly on the given device.

    For each allocator type and device, the same sequence is allocated into
    a single BlockTable and freed again, five times over. While allocated,
    the device's free-block count must be reduced by exactly the number of
    blocks the sequence needs and every physical block id must be set; after
    the free, the count must be back at the full pool size.
    """
    # Map the parametrized string ("cpu" / "gpu") onto the Device enum.
    target_device = Device[device.upper()]

    total_blocks = 1024
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=total_blocks,
        num_cpu_blocks=total_blocks,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
    expected_free_while_allocated = total_blocks - blocks_per_alloc

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    for _ in range(5):
        block_table.allocate(token_ids=token_ids, device=target_device)
        assert allocator.get_num_free_blocks(
            target_device) == expected_free_while_allocated
        assert not any(block_id is None
                       for block_id in block_table.physical_block_ids)

        # Freeing must hand every block back to the pool.
        block_table.free()
        assert allocator.get_num_free_blocks(target_device) == total_blocks
|
|
|
|
|
|
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_allocation(block_size: int, sequence_len: int,
                                     append_len: int, allocator_type: str):
    """Check how many blocks a BlockTable holds before and after an append.

    A BlockTable is allocated with an initial sequence and then extended via
    append_token_ids. The number of physical block ids is compared, both
    before and after the append, with the number of block_size-sized chunks
    needed to hold the corresponding token list.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    initial_tokens = list(range(sequence_len))
    extra_tokens = list(range(append_len))

    # Expected block counts: one block per chunk of the token list.
    blocks_before = len(list(chunk_list(initial_tokens, block_size)))
    blocks_after = len(
        list(chunk_list(initial_tokens + extra_tokens, block_size)))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    block_table.allocate(token_ids=initial_tokens, device=Device.GPU)
    assert len(block_table.physical_block_ids) == blocks_before

    block_table.append_token_ids(extra_tokens)
    assert len(block_table.physical_block_ids) == blocks_after
|
|
|
|
|
|
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
                                           num_empty_slots: int,
                                           allocator_type: str):
    """Check block usage when reserving empty slots in a BlockTable.

    A BlockTable is allocated with an initial sequence, after which
    ensure_num_empty_slots reserves room for more tokens. The test verifies
    the number of physical blocks before and after the reservation, and then
    that filling the reserved slots with real tokens does not change the
    allocator's free-block count.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))

    # Expected block counts; the reserved slots are modelled as placeholder
    # tokens (-1) appended to the sequence.
    blocks_before = len(list(chunk_list(token_ids, block_size)))
    padded_tokens = token_ids + [-1] * num_empty_slots
    blocks_after = len(list(chunk_list(padded_tokens, block_size)))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    block_table.allocate(token_ids=token_ids, device=Device.GPU)
    assert len(block_table.physical_block_ids) == blocks_before

    # Assert that the empty slots consume the expected number of additional
    # blocks.
    block_table.ensure_num_empty_slots(num_empty_slots)
    assert len(block_table.physical_block_ids) == blocks_after

    # Now, ensure no additional blocks consumed as we fill up the empty slots.
    free_before_fill = allocator.get_num_free_blocks(device=Device.GPU)
    block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
    assert free_before_fill == allocator.get_num_free_blocks(
        device=Device.GPU)
|
|
|
|
|
|
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 9])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("append_size", [1, 4, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
                                          append_len: int, allocator_type: str,
                                          append_size: int):
    """Verify that appended token ids end up in the table in order.

    ``append_len`` tokens are appended to an allocated BlockTable in pieces
    of at most ``append_size`` tokens. After every piece, and once more at
    the end, the table's full token list must equal the initial tokens
    followed by everything appended so far.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_ids_to_append = list(range(append_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    block_table.allocate(token_ids=token_ids, device=Device.GPU)

    # Running expectation of the table's content; starts as a copy of the
    # initial sequence and grows with each appended piece.
    expected_tokens = list(token_ids)
    for piece in chunk_list(token_ids_to_append, append_size):
        block_table.append_token_ids(piece)
        expected_tokens.extend(piece)

        assert block_table._get_all_token_ids() == expected_tokens

    assert block_table._get_all_token_ids() == token_ids + token_ids_to_append
|
|
|
|
|
|
@pytest.mark.parametrize("seq_len", [1, 9, 129])
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_fork(seq_len: int, block_size: int, allocator_type: str):
    """Create a sequence using the specified allocator.

    1. Assert that after forking the sequence, the free block count is the
       same.
    2. Assert that the forked sequence has the same physical mappings and
       token ids.
    3. Then free the original sequence; verify that the free block count is
       the same.
    4. Finally, free the forked sequence and verify that the free block
       count returns to the total number of GPU blocks, i.e. nothing
       remains allocated.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(seq_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    # Name the device explicitly, as the other tests in this file do; every
    # free-block check below is made against Device.GPU.
    block_table.allocate(token_ids=token_ids, device=Device.GPU)

    num_free_blocks_before_fork = allocator.get_num_free_blocks(
        device=Device.GPU)

    forked_block_table = block_table.fork()

    # Expect physical_block_ids and token_ids to match.
    assert (block_table.physical_block_ids ==
            forked_block_table.physical_block_ids)
    assert block_table._get_all_token_ids(
    ) == forked_block_table._get_all_token_ids()

    # Do not expect any additional allocations.
    assert allocator.get_num_free_blocks(
        device=Device.GPU) == num_free_blocks_before_fork

    # Free the original blocks. Assert num free blocks does not change, since
    # refcount is nonzero.
    block_table.free()
    assert allocator.get_num_free_blocks(
        device=Device.GPU) == num_free_blocks_before_fork

    # Expect the forked block table to be unaffected by the free.
    assert all(block_id is not None
               for block_id in forked_block_table.physical_block_ids)

    # Free the forked blocks. Assert num free blocks does change, since
    # refcount is now zero: the whole pool must be free again.
    forked_block_table.free()
    assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks
|
|
|
|
|
|
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow(block_size: int, sequence_len: int, append_len: int,
             allocator_type: str, appender: str):
    """Fork a sequence; append to one of the two copies; verify the CoW.

    ``appender`` selects which table ("forked" or "original") receives the
    appended tokens; the other table is expected to stay untouched. The test
    checks the free-block count after the fork and after the append, the
    copy-on-write records reported by the allocator, and that freeing both
    tables returns every block to the pool.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_ids_to_append = list(range(append_len))

    original_block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    # Blocks needed by the initial sequence alone.
    num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
    # Blocks the appender is expected to newly obtain: every block index from
    # the first one written by the append (sequence_len // block_size) up to
    # the end of the extended sequence. When the last initial block is only
    # partially filled, this count includes its copy.
    num_expected_cow_blocks = cdiv(sequence_len + append_len,
                                   block_size) - (sequence_len // block_size)

    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
    # Snapshot (copy) of the ids so later mutation of either table cannot
    # alter the reference value.
    original_block_ids = original_block_table.physical_block_ids[:]

    forked_block_table = original_block_table.fork()

    # Expect no additional allocation (copy on _write_).
    assert allocator.get_num_free_blocks(
        Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks)

    if appender == "forked":
        appender_block_table = forked_block_table
        static_block_table = original_block_table
    elif appender == "original":
        appender_block_table = original_block_table
        static_block_table = forked_block_table
    else:
        raise ValueError(f"unknown test config {appender=}")

    # Write tokens.
    appender_block_table.append_token_ids(token_ids_to_append)

    # Expect the non-appending block table to have no change.
    assert static_block_table.physical_block_ids == original_block_ids
    assert appender_block_table.physical_block_ids != original_block_ids

    # Expect the blocks changed during append to have a CoW.
    assert allocator.get_num_free_blocks(
        Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks +
                                         num_expected_cow_blocks)

    cows = allocator.clear_copy_on_writes()
    if sequence_len % block_size > 0:
        # If the last block in the sequence is not full, then when appending we
        # expect a CoW.
        assert cows

        # Index of the partially filled block that the append writes into.
        cow_block_id = sequence_len // block_size
        expected_src = static_block_table.physical_block_ids[cow_block_id]
        expected_dst = appender_block_table.physical_block_ids[cow_block_id]

        assert (expected_src, expected_dst) in cows
    else:
        # Otherwise, there should be no copy-on-write.
        assert not cows

    static_block_table.free()
    appender_block_table.free()

    # After free, expect all blocks to be freed.
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
|
|
|
|
|
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow_lookahead_simple(block_size: int, sequence_len: int,
                              append_len: int, lookahead_slots: int,
                              allocator_type: str, appender: str):
    """Variant of test_cow in which lookahead slots are reserved before the
    fork.

    The assertions are deliberately looser than in test_cow: the test only
    requires that the non-appending table is unchanged, that some
    copy-on-write was recorded, that the expected (src, dst) pair is present
    when the last initial block was partially filled, and that every block
    is free again at the end.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_ids_to_append = list(range(append_len))

    original_block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)

    # Allocate lookahead slots.
    original_block_table.ensure_num_empty_slots(lookahead_slots)
    ids_before_fork = original_block_table.physical_block_ids[:]

    forked_block_table = original_block_table.fork()

    # (appender table, static table) for each supported configuration.
    roles = {
        "forked": (forked_block_table, original_block_table),
        "original": (original_block_table, forked_block_table),
    }
    if appender not in roles:
        raise ValueError(f"unknown test config {appender=}")
    appender_block_table, static_block_table = roles[appender]

    # Write tokens.
    appender_block_table.append_token_ids(token_ids_to_append)

    # Expect the non-appending block table to have no change.
    assert static_block_table.physical_block_ids == ids_before_fork
    assert appender_block_table.physical_block_ids != ids_before_fork

    # Always expect copy-on-write
    cows = allocator.clear_copy_on_writes()
    assert cows

    last_block_partially_filled = sequence_len % block_size > 0
    if last_block_partially_filled:
        # The append writes into the partially filled block, so the recorded
        # copies must include static's block -> appender's block at that
        # index.
        cow_block_id = sequence_len // block_size
        expected_pair = (
            static_block_table.physical_block_ids[cow_block_id],
            appender_block_table.physical_block_ids[cow_block_id],
        )
        assert expected_pair in cows

    static_block_table.free()
    appender_block_table.free()

    # After free, expect all blocks to be freed.
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
|
|
|
|
|
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_new_tokens", [1, 16, 129])
@pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int,
                                            num_new_tokens: int,
                                            num_lookahead_slots: int,
                                            allocator_type: str):
    """Verify the result of get_num_blocks_touched_by_append_slots.

    The check relies on copy-on-write: a block with refcount > 1 has to be
    copied before it is written. Forking the sequence raises the refcount of
    its blocks, so the drop in free blocks caused by an append measures how
    many blocks the append touched. That measured number is compared with
    the value returned by `get_num_blocks_touched_by_append_slots`.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    new_token_ids = list(range(num_new_tokens))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    block_table.allocate(token_ids=token_ids, device=Device.GPU)

    # Add lookahead before fork so both sequences have the same lookahead
    # blocks.
    block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots)

    # Fork sequence so that every block has refcount > 1. The fork itself is
    # not used again; it is only kept referenced.
    _forked_table = block_table.fork()

    # Determine how many blocks should be touched.
    predicted_touched = block_table.get_num_blocks_touched_by_append_slots(
        token_ids=new_token_ids, num_lookahead_slots=num_lookahead_slots)

    # Measure how many blocks are touched by measuring num_free_blocks before
    # and after the append.
    #
    # We expect append_token_ids to CoW all mutated blocks that have refcount>1.
    free_before = allocator.get_num_free_blocks(Device.GPU)
    block_table.append_token_ids(new_token_ids, num_lookahead_slots)
    free_after = allocator.get_num_free_blocks(Device.GPU)
    consumed = free_before - free_after

    # TODO(cade) ensure equality when num_lookahead_slots > 0.
    # The reason we have < is because lookahead blocks are not copied eagerly;
    # they are copied on first write. This will cause issues for beam search +
    # speculative decoding. This is acceptable for now as it is a large effort
    # to combine the two. To fix this, we can ensure single sequence ownership
    # of lookahead blocks by appending empty slots to each block, which will
    # trigger the CoW.
    #
    # Until then, we can accept that the consumed tokens are <= the expected
    # tokens when appending with lookahead.
    if num_lookahead_slots <= 0:
        assert consumed == predicted_touched
    else:
        assert consumed <= predicted_touched
|