501 lines
18 KiB
Python
501 lines
18 KiB
Python
![]() |
import pytest
|
||
|
|
||
|
from vllm.core.block.block_table import BlockTable
|
||
|
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
|
||
|
from vllm.utils import Device, cdiv, chunk_list
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_naive(block_size: int, sequence_len: int):
    """Test the allocation of blocks using the naive allocator.

    This test creates a CpuGpuBlockAllocator with the specified block size and
    number of blocks. It then allocates the same token sequence into several
    separate BlockTables and verifies that the number of free GPU blocks is
    the full pool before the first allocation and shrinks by one allocation's
    worth of blocks after every allocation, including the last one.
    """
    assert block_size > 1
    num_gpu_blocks = 1024
    num_allocations = 5

    allocator = CpuGpuBlockAllocator.create(
        allocator_type="naive",
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))

    # Nothing has been allocated yet, so the whole GPU pool must be free.
    assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks

    # The tables are retained in a list; none of them is freed in this test.
    block_tables = []
    for alloc_i in range(1, num_allocations + 1):
        block_table = BlockTable(
            block_size=block_size,
            block_allocator=allocator,
        )
        block_table.allocate(token_ids=token_ids, device=Device.GPU)
        block_tables.append(block_table)

        # Check *after* each allocation so the final allocation is verified
        # too; every table is expected to consume its own set of blocks.
        assert allocator.get_num_free_blocks(device=Device.GPU) == (
            num_gpu_blocks - alloc_i * num_blocks_per_alloc)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_prefix_caching(block_size: int, sequence_len: int):
    """Check free-block accounting for the prefix caching allocator.

    A CpuGpuBlockAllocator of type "prefix_caching" is created and the same
    token sequence is allocated into five separate BlockTables. After each
    allocation the number of free GPU blocks is compared with the expected
    value.

    The expectation encoded here is that all tables share the blocks that
    are completely filled (immutable), while a trailing, partially filled
    block (mutable) is consumed once per table. The per-allocation counts of
    immutable and mutable blocks are derived from the sequence length and
    the block size.
    """
    assert block_size > 1
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type="prefix_caching",
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    chunks = list(chunk_list(token_ids, block_size))

    # Only a partially filled last chunk yields a mutable block.
    last_chunk_is_full = len(chunks[-1]) == block_size
    mutable_per_alloc = 0 if last_chunk_is_full else 1
    immutable_per_alloc = len(chunks) - mutable_per_alloc

    tables = []
    for num_allocs_done in range(1, 6):
        table = BlockTable(
            block_size=block_size,
            block_allocator=allocator,
        )
        tables.append(table)
        table.allocate(token_ids=token_ids, device=Device.GPU)

        # Immutable blocks are counted once overall; the mutable block is
        # counted once for every table allocated so far.
        expected_used = (immutable_per_alloc +
                         mutable_per_alloc * num_allocs_done)
        free_now = allocator.get_num_free_blocks(device=Device.GPU)
        assert free_now == num_gpu_blocks - expected_used
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
@pytest.mark.parametrize("device", ["cpu", "gpu"])
def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
                       device: str):
    """Allocate and free one BlockTable repeatedly on a given device.

    The allocator is created with the same number of CPU and GPU blocks. A
    single BlockTable is then allocated with the same sequence and freed
    again, five times in a row. On every round the test checks that:

    * after allocating, the free-block count of the device has dropped by
      the number of blocks the sequence needs,
    * no physical block id of the table is None,
    * after freeing, the device reports all of its blocks as free again.
    """
    # Map the parametrized string ("cpu"/"gpu") onto the Device enum member.
    target_device = Device[device.upper()]

    num_device_blocks = 1024
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_device_blocks,
        num_cpu_blocks=num_device_blocks,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    blocks_needed = len(list(chunk_list(token_ids, block_size)))
    free_while_allocated = num_device_blocks - blocks_needed

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    for _ in range(5):
        block_table.allocate(token_ids=token_ids, device=target_device)
        free_after_alloc = allocator.get_num_free_blocks(target_device)
        assert free_after_alloc == free_while_allocated
        assert not any(block_id is None
                       for block_id in block_table.physical_block_ids)

        block_table.free()
        free_after_release = allocator.get_num_free_blocks(target_device)
        assert free_after_release == num_device_blocks
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_allocation(block_size: int, sequence_len: int,
                                     append_len: int, allocator_type: str):
    """Check how many blocks a BlockTable holds before and after an append.

    A BlockTable is allocated on the GPU with an initial sequence, then more
    token ids are appended. The number of physical block ids is compared with
    the number of block-sized chunks of the initial sequence (before the
    append) and of the initial sequence plus the appended tokens (after the
    append).
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_ids_to_append = list(range(append_len))

    # Expected table sizes, expressed as totals rather than as a delta.
    blocks_before = len(list(chunk_list(token_ids, block_size)))
    blocks_after = len(
        list(chunk_list(token_ids + token_ids_to_append, block_size)))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    block_table.allocate(token_ids=token_ids, device=Device.GPU)
    assert len(block_table.physical_block_ids) == blocks_before

    block_table.append_token_ids(token_ids_to_append)
    assert len(block_table.physical_block_ids) == blocks_after
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
                                           num_empty_slots: int,
                                           allocator_type: str):
    """Check block accounting around BlockTable.ensure_num_empty_slots.

    A BlockTable is allocated on the GPU with an initial sequence, after
    which ``num_empty_slots`` empty slots are requested. The test verifies:

    1. the number of physical block ids right after the initial allocation,
    2. the number of physical block ids after reserving the empty slots,
       which must match a sequence extended by ``num_empty_slots`` tokens,
    3. that appending exactly ``num_empty_slots`` tokens afterwards leaves
       the allocator's free GPU block count unchanged.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))

    # Placeholder tokens stand in for the empty slots when sizing the table.
    padded_token_ids = token_ids + [-1] * num_empty_slots
    blocks_before = len(list(chunk_list(token_ids, block_size)))
    blocks_after = len(list(chunk_list(padded_token_ids, block_size)))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    block_table.allocate(token_ids=token_ids, device=Device.GPU)

    # The initial allocation alone covers only the original sequence.
    assert len(block_table.physical_block_ids) == blocks_before

    # Reserving the empty slots must grow the table to the padded size.
    block_table.ensure_num_empty_slots(num_empty_slots)
    assert len(block_table.physical_block_ids) == blocks_after

    # Filling the reserved slots must not take any further blocks.
    free_before_fill = allocator.get_num_free_blocks(device=Device.GPU)
    block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
    free_after_fill = allocator.get_num_free_blocks(device=Device.GPU)
    assert free_before_fill == free_after_fill
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 9])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("append_size", [1, 4, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
                                          append_len: int, allocator_type: str,
                                          append_size: int):
    """Verify that appended token ids end up in the table in order.

    ``append_len`` tokens are appended in pieces of at most ``append_size``
    tokens. After every piece, and once more at the end, the token ids
    reported by the BlockTable must equal the initial sequence followed by
    everything appended so far.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_ids_to_append = list(range(append_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    block_table.allocate(token_ids=token_ids, device=Device.GPU)

    # Running copy of what the table is expected to contain.
    expected_token_ids = list(token_ids)
    for piece in chunk_list(token_ids_to_append, append_size):
        block_table.append_token_ids(piece)
        expected_token_ids.extend(piece)

        assert block_table._get_all_token_ids() == expected_token_ids

    final_token_ids = block_table._get_all_token_ids()
    assert final_token_ids == token_ids + token_ids_to_append
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("seq_len", [1, 9, 129])
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_fork(seq_len: int, block_size: int, allocator_type: str):
    """Allocate a sequence, fork it, and follow the free GPU block count.

    1. After forking, the free block count is the same as before the fork.
    2. The forked table reports the same physical block ids and the same
       token ids as the original table.
    3. Freeing the original table leaves the free block count unchanged.
    4. Freeing the forked table as well brings the free block count back to
       the total number of GPU blocks.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    def free_gpu_blocks() -> int:
        """Return the allocator's current number of free GPU blocks."""
        return allocator.get_num_free_blocks(device=Device.GPU)

    token_ids = list(range(seq_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    block_table.allocate(token_ids)

    free_before_fork = free_gpu_blocks()

    forked_block_table = block_table.fork()

    # Both tables must report the same block ids and the same token ids.
    original_ids = block_table.physical_block_ids
    forked_ids = forked_block_table.physical_block_ids
    assert original_ids == forked_ids

    original_tokens = block_table._get_all_token_ids()
    forked_tokens = forked_block_table._get_all_token_ids()
    assert original_tokens == forked_tokens

    # The fork itself must not have taken any blocks.
    assert free_gpu_blocks() == free_before_fork

    # Releasing the original must not return blocks to the pool while the
    # fork still refers to them.
    block_table.free()
    assert free_gpu_blocks() == free_before_fork

    # The fork must still hold valid block ids after the original is freed.
    assert not any(block_id is None
                   for block_id in forked_block_table.physical_block_ids)

    # Releasing the fork as well hands every block back.
    forked_block_table.free()
    assert free_gpu_blocks() == num_gpu_blocks
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow(block_size: int, sequence_len: int, append_len: int,
             allocator_type: str, appender: str):
    """Fork a sequence; append to one of the two tables; verify there's a CoW.

    ``appender`` selects which table ("forked" or "original") receives the
    appended tokens; the other table is the static one. The test checks that:

    * forking consumes no additional GPU blocks,
    * after the append, the static table still reports the block ids recorded
      before the fork while the appending table reports different ones,
    * the free GPU block count matches the expected number of non-CoW and CoW
      blocks,
    * a copy-on-write is recorded for the last block of the original sequence
      exactly when that block was not full, and none is recorded otherwise,
    * freeing both tables returns every GPU block to the allocator.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_ids_to_append = list(range(append_len))

    original_block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    # Blocks needed by the original sequence alone.
    num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
    # Blocks needed by the appended-to sequence, not counting the full blocks
    # of the original sequence.
    num_expected_cow_blocks = cdiv(sequence_len + append_len,
                                   block_size) - (sequence_len // block_size)

    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
    # Take a copy of the ids instead of keeping the object the property
    # returns: should that object ever be the table's own live list, an append
    # through the original table would change this baseline as well and the
    # comparisons below would no longer test anything.
    original_block_ids = original_block_table.physical_block_ids[:]

    forked_block_table = original_block_table.fork()

    # Expect no additional allocation (copy on _write_).
    assert allocator.get_num_free_blocks(
        Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks)

    if appender == "forked":
        appender_block_table = forked_block_table
        static_block_table = original_block_table
    elif appender == "original":
        appender_block_table = original_block_table
        static_block_table = forked_block_table
    else:
        raise ValueError(f"unknown test config {appender=}")

    # Write tokens.
    appender_block_table.append_token_ids(token_ids_to_append)

    # Expect the non-appending block table to have no change.
    assert static_block_table.physical_block_ids == original_block_ids
    assert appender_block_table.physical_block_ids != original_block_ids

    # Expect the blocks changed during append to have a CoW.
    assert allocator.get_num_free_blocks(
        Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks +
                                         num_expected_cow_blocks)

    cows = allocator.clear_copy_on_writes()
    if sequence_len % block_size > 0:
        # If the last block in the sequence is not full, then when appending we
        # expect a CoW.
        assert cows

        # Index of the partially filled last block of the original sequence.
        cow_block_id = sequence_len // block_size
        expected_src = static_block_table.physical_block_ids[cow_block_id]
        expected_dst = appender_block_table.physical_block_ids[cow_block_id]

        assert expected_src in cows
        assert expected_dst in cows[expected_src]
    else:
        # Otherwise, there should be no copy-on-write.
        assert not cows

    static_block_table.free()
    appender_block_table.free()

    # After free, expect all blocks to be freed.
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow_lookahead_simple(block_size: int, sequence_len: int,
                              append_len: int, lookahead_slots: int,
                              allocator_type: str, appender: str):
    """Similar to test_cow, except with lookahead allocation. The assertions are
    less rigorous due to the complexity of the property under test.

    Lookahead slots are reserved on the original table before it is forked.
    ``appender`` selects which table ("forked" or "original") receives the
    appended tokens. The test checks that the static table keeps the block
    ids recorded before the fork, that the appending table's block ids
    change, that a copy-on-write is always recorded (and, when the last block
    of the original sequence was not full, that it maps that block of the
    static table to the matching block of the appending table), and that
    freeing both tables returns every GPU block to the allocator.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_ids_to_append = list(range(append_len))

    original_block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)

    # Allocate lookahead slots.
    original_block_table.ensure_num_empty_slots(lookahead_slots)
    # Take a copy of the ids instead of keeping the object the property
    # returns: should that object ever be the table's own live list, an append
    # through the original table would change this baseline as well and the
    # comparisons below would no longer test anything.
    original_block_ids = original_block_table.physical_block_ids[:]

    forked_block_table = original_block_table.fork()

    if appender == "forked":
        appender_block_table = forked_block_table
        static_block_table = original_block_table
    elif appender == "original":
        appender_block_table = original_block_table
        static_block_table = forked_block_table
    else:
        raise ValueError(f"unknown test config {appender=}")

    # Write tokens.
    appender_block_table.append_token_ids(token_ids_to_append)

    # Expect the non-appending block table to have no change.
    assert static_block_table.physical_block_ids == original_block_ids
    assert appender_block_table.physical_block_ids != original_block_ids

    cows = allocator.clear_copy_on_writes()

    # Always expect copy-on-write
    assert cows

    if sequence_len % block_size > 0:
        # The last block of the original sequence was not full, so that
        # specific block is expected to be among the recorded CoWs.
        cow_block_id = sequence_len // block_size
        expected_src = static_block_table.physical_block_ids[cow_block_id]
        expected_dst = appender_block_table.physical_block_ids[cow_block_id]

        assert expected_src in cows
        assert expected_dst in cows[expected_src]

    static_block_table.free()
    appender_block_table.free()

    # After free, expect all blocks to be freed.
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|