[Core] Optimize block_manager_v2 vs block_manager_v1 (to make V2 default) (#5602)
parent 54600709b6
commit 3476ed0809
@@ -46,6 +46,7 @@ def main(args: argparse.Namespace):
         load_format=args.load_format,
         distributed_executor_backend=args.distributed_executor_backend,
         otlp_traces_endpoint=args.otlp_traces_endpoint,
+        enable_prefix_caching=args.enable_prefix_caching,
     )
 
     sampling_params = SamplingParams(
@@ -220,6 +221,9 @@ if __name__ == '__main__':
                         action='store_true',
                         help='If True, the prefill requests can be chunked based on the '
                         'max_num_batched_tokens')
+    parser.add_argument("--enable-prefix-caching",
+                        action='store_true',
+                        help="Enable automatic prefix caching")
     parser.add_argument('--use-v2-block-manager', action='store_true')
     parser.add_argument(
         "--ray-workers-use-nsight",
@@ -474,7 +474,7 @@ class VllmRunner:
             req_sample_output_strs: List[str] = []
             for sample in req_output.outputs:
                 output_str = sample.text
-                output_ids = sample.token_ids
+                output_ids = list(sample.token_ids)
                 req_sample_output_ids.append(prompt_ids + output_ids)
                 req_sample_output_strs.append(prompt_str + output_str)
             outputs.append((req_sample_output_ids, req_sample_output_strs))
@@ -373,8 +373,9 @@ def test_cow(block_size: int, sequence_len: int, append_len: int,
                      block_size) - (sequence_len // block_size)
 
     original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
-    original_block_ids = original_block_table.physical_block_ids
+    original_block_ids = original_block_table.physical_block_ids[:]
 
+    print("original_block_ids = {}".format(original_block_ids))
     forked_block_table = original_block_table.fork()
 
     # Expect no additional allocation (copy on _write_).
@@ -457,7 +458,7 @@ def test_cow_lookahead_simple(block_size: int, sequence_len: int,
 
     # Allocate lookahead slots.
     original_block_table.ensure_num_empty_slots(lookahead_slots)
-    original_block_ids = original_block_table.physical_block_ids
+    original_block_ids = original_block_table.physical_block_ids[:]
 
     forked_block_table = original_block_table.fork()
 
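Review note: both `physical_block_ids` call sites above now take a `[:]` copy. With the `BlockList` introduced in `vllm/core/block/common.py` below, the property returns the allocator's internally cached id list rather than a fresh one, so holding the raw reference would let later appends (and their copy-on-writes) mutate the test's snapshot. A minimal sketch of the hazard, using a stand-in class (illustrative, not code from this PR):

```python
class TinyTable:
    """Stand-in for BlockTable: `physical_block_ids` returns the cached
    internal list (the new behavior), not a fresh copy."""

    def __init__(self) -> None:
        self._block_ids = [0, 1]

    @property
    def physical_block_ids(self):
        return self._block_ids  # live internal list

    def cow_last_block(self, new_id: int) -> None:
        self._block_ids[-1] = new_id  # e.g. copy-on-write after a fork


table = TinyTable()
ids_ref = table.physical_block_ids          # aliases internal state
ids_snapshot = table.physical_block_ids[:]  # independent copy

table.cow_last_block(5)
assert ids_ref == [0, 5]       # mutated under the caller
assert ids_snapshot == [0, 1]  # what the tests want to compare against
```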
@@ -8,8 +8,8 @@ from vllm.utils import Device, chunk_list
 @pytest.mark.parametrize("num_gpu_blocks", [1024])
 @pytest.mark.parametrize("block_size", [16])
 @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
-                          block_size: int, allocator_type: str):
+def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
+                                block_size: int, allocator_type: str):
     allocator = CpuGpuBlockAllocator.create(
         allocator_type=allocator_type,
         num_gpu_blocks=num_gpu_blocks,
@@ -21,14 +21,14 @@ def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
     assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
 
     cpu_blocks = [
-        allocator.allocate_mutable(prev_block=None, device=Device.CPU)
+        allocator.allocate_mutable_block(prev_block=None, device=Device.CPU)
         for _ in range(num_cpu_blocks)
     ]
     assert allocator.get_num_free_blocks(Device.CPU) == 0
     assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
 
     gpu_blocks = [
-        allocator.allocate_mutable(prev_block=None, device=Device.GPU)
+        allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
         for _ in range(num_gpu_blocks)
     ]
     assert allocator.get_num_free_blocks(Device.CPU) == 0
@@ -47,8 +47,8 @@ def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
 @pytest.mark.parametrize("num_gpu_blocks", [1024])
 @pytest.mark.parametrize("block_size", [2])
 @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
-def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int,
-                            block_size: int, allocator_type: str):
+def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
+                                  block_size: int, allocator_type: str):
     allocator = CpuGpuBlockAllocator.create(
         allocator_type=allocator_type,
         num_gpu_blocks=num_gpu_blocks,
@@ -67,18 +67,18 @@ def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int,
     assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
 
     cpu_blocks = [
-        allocator.allocate_immutable(prev_block=None,
-                                     token_ids=token_ids,
-                                     device=Device.CPU)
+        allocator.allocate_immutable_block(prev_block=None,
+                                           token_ids=token_ids,
+                                           device=Device.CPU)
         for token_ids in cpu_token_ids
     ]
     assert allocator.get_num_free_blocks(Device.CPU) == 0
     assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
 
     gpu_blocks = [
-        allocator.allocate_immutable(prev_block=None,
-                                     token_ids=token_ids,
-                                     device=Device.GPU)
+        allocator.allocate_immutable_block(prev_block=None,
+                                           token_ids=token_ids,
+                                           device=Device.GPU)
         for token_ids in gpu_token_ids
     ]
     assert allocator.get_num_free_blocks(Device.CPU) == 0
@@ -14,11 +14,11 @@ class TestNaiveBlockAllocator:
                               prev_block: Optional[Block],
                               token_ids: List[int]):
         if allocate_type == "immutable":
-            allocate_block = lambda: allocator.allocate_immutable(
+            allocate_block = lambda: allocator.allocate_immutable_block(
                 prev_block=prev_block, token_ids=token_ids)
         elif allocate_type == "mutable":
-            allocate_block = lambda: allocator.allocate_mutable(prev_block=
-                                                                prev_block)
+            allocate_block = lambda: allocator.allocate_mutable_block(
+                prev_block=prev_block)
         else:
             raise ValueError()
 
@@ -26,11 +26,10 @@ class TestPrefixCachingBlock:
         token_ids = list(range(num_to_fill))
         mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
 
-        block_with_prev = PrefixCachingBlock(
-            prev_block=None,
-            token_ids=token_ids,
-            block_size=block_size,
-            prefix_caching_allocator=mock_allocator)
+        block_with_prev = PrefixCachingBlock(prev_block=None,
+                                             token_ids=token_ids,
+                                             block_size=block_size,
+                                             allocator=mock_allocator)
 
         if is_curr_block_full:
             # Expect hash since block is full.
@@ -71,7 +70,7 @@ class TestPrefixCachingBlock:
             prev_block=previous_block,
             token_ids=token_ids,
             block_size=block_size,
-            prefix_caching_allocator=mock_allocator,
+            allocator=mock_allocator,
         )
 
         if is_curr_block_full and prev_block_has_hash:
@@ -138,7 +137,7 @@ class TestPrefixCachingBlock:
                 prev_block=prev_block,
                 token_ids=[],
                 block_size=block_size,
-                prefix_caching_allocator=allocator,
+                allocator=allocator,
             )
 
             tokens_to_append = token_ids[block_number *
@@ -159,11 +158,11 @@ class TestPrefixCachingBlockAllocator:
                               prev_block: Optional[Block],
                               token_ids: List[int]):
         if allocate_type == "immutable":
-            allocate_block = lambda: allocator.allocate_immutable(
+            allocate_block = lambda: allocator.allocate_immutable_block(
                 prev_block=prev_block, token_ids=token_ids)
         elif allocate_type == "mutable":
-            allocate_block = lambda: allocator.allocate_mutable(prev_block=
-                                                                prev_block)
+            allocate_block = lambda: allocator.allocate_mutable_block(
+                prev_block=prev_block)
         else:
             raise ValueError()
 
@@ -233,12 +232,13 @@ class TestPrefixCachingBlockAllocator:
 
         # Expect allocation with unseen hash to fail.
         with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocator.allocate_immutable(prev_block=chain[-1],
-                                         token_ids=list(range(block_size)))
+            allocator.allocate_immutable_block(prev_block=chain[-1],
+                                               token_ids=list(
+                                                   range(block_size)))
 
         # Expect mutable allocation to fail.
         with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocator.allocate_mutable(prev_block=chain[-1])
+            allocator.allocate_mutable_block(prev_block=chain[-1])
 
         # Expect allocation of exact same chain to pass.
         second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
@@ -270,7 +270,7 @@ class TestPrefixCachingBlockAllocator:
 
         # Expect mutable allocation to fail.
         with pytest.raises(BlockAllocator.NoFreeBlocksError):
-            allocator.allocate_mutable(prev_block=None)
+            allocator.allocate_mutable_block(prev_block=None)
 
         block_to_free = chain[-1]
 
@@ -280,11 +280,11 @@ class TestPrefixCachingBlockAllocator:
             allocator.free(block_to_free)
             assert block_to_free.block_id is None, i
 
-            new_block = allocator.allocate_mutable(prev_block=None)
+            new_block = allocator.allocate_mutable_block(prev_block=None)
             assert new_block.block_id == block_id, i
 
             with pytest.raises(BlockAllocator.NoFreeBlocksError):
-                allocator.allocate_mutable(prev_block=None)
+                allocator.allocate_mutable_block(prev_block=None)
 
             block_to_free = new_block
 
@@ -376,7 +376,6 @@ class TestPrefixCachingBlockAllocator:
 
         # Create token ids that will exhaust all blocks.
         token_ids = list(range(num_blocks_to_consume * block_size))
-        blocks = list(range(num_blocks_to_consume))
 
         first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
             block_size=block_size,
@@ -384,9 +383,6 @@ class TestPrefixCachingBlockAllocator:
             allocator=allocator,
         )
 
-        # mark all blocks in first chain as computed
-        allocator.mark_blocks_as_computed(blocks)
-
         # After zero_point, second_chain's token_ids would be set -1, which
         # make it different from here comparing with first_chain
         zero_point = random.randint(1, len(token_ids) - 1)
@@ -424,15 +420,16 @@ class TestPrefixCachingBlockAllocator:
                                                 block_size=block_size)
         token_ids = list(range(block_size))
 
-        block = allocator.allocate_immutable(prev_block=None,
-                                             token_ids=token_ids)
+        block = allocator.allocate_immutable_block(prev_block=None,
+                                                   token_ids=token_ids)
 
         assert allocator._refcounter.get(block.block_id) == 1
-        m = allocator.allocate_mutable(prev_block=None)
+        m = allocator.allocate_mutable_block(prev_block=None)
 
         block_id = m.block_id
         for i in range(block_size):
             m.append_token_ids([i])
 
         # After block get promoted to immutable from mutable, if there is
         # already same content hash block, then it shall be released into
         # hashless_allocator
@@ -452,48 +449,79 @@ class TestPrefixCachingBlockAllocator:
 
         all_blocks_list = [i for i in range(num_blocks)]
         zero_ref = {i: 0 for i in range(num_blocks)}
+        one_ref = {i: 1 for i in range(num_blocks)}
         allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                 block_size=block_size)
         token_ids = list(range(num_blocks * block_size))
 
-        # now we have num_blocks free blocks in hashless allocator
-        # with internal tracking list _blocks _cached_blocks and evictor
-        # empty and block's ref shall be 0
+        # Verify initial/pre-alloc state
+
+        # Ensure all blocks are free inside hashless allocator
         assert list(allocator._hashless_allocator._free_block_indices
                     ) == all_blocks_list
-        assert len(allocator._blocks.keys()) == 0
+        # Ensure no tracked blocks
+        assert len(allocator._block_tracker.keys()) == num_blocks
+        for block_id in range(num_blocks):
+            assert not allocator._block_tracker[block_id].active
+        # Ensure no cached blocks
         assert len(allocator._cached_blocks.values()) == 0
+        # Ensure no evicted blocks
        assert len(allocator.evictor.free_table.keys()) == 0
+        # Ensure 0s ref counts for all blocks
         assert allocator._refcounter._refcounts == zero_ref
 
         # Allocate immutable chains with only one block residuled in
         new_block = []
         for i in range(num_blocks):
-            block = allocator.allocate_immutable(
+            block = allocator.allocate_immutable_block(
                 prev_block=None,
                 token_ids=token_ids[block_size * i:block_size * (i + 1)])
             new_block.append(block)
 
+        # Verify post-alloc state
+
+        # Ensure no blocks are free inside hashless allocator
+        assert (len(allocator._hashless_allocator._free_block_indices) == 0)
+        # Ensure all blocks are tracked
+        assert len(allocator._block_tracker.keys()) == num_blocks
+        for block_id in range(num_blocks):
+            assert allocator._block_tracker[block_id].active
+        # Ensure all blocks are cached (all promoted)
+        assert len(allocator._cached_blocks.values()) == num_blocks
+        # Ensure no evicted blocks
+        assert len(allocator.evictor.free_table.keys()) == 0
+        # Ensure 1s ref counts for all blocks
+        assert allocator._refcounter._refcounts == one_ref
+
         # Free all blocks, and now all blocks shall be in the evictor
-        # there shall be no tracking data left in _blocks
+        # there shall be no tracking data left in _block_tracker
         # all blocks shall be tracked in _cached_blocks
         # all blocks' ref shall be zero
         for block in new_block:
             allocator.free(block)
 
-        assert len(allocator._blocks.keys()) == 0
+        # Verify post-free state
+
+        # Ensure no tracked blocks
+        assert len(allocator._block_tracker.keys()) == num_blocks
+        for block_id in range(num_blocks):
+            assert not allocator._block_tracker[block_id].active
+        # Ensure no blocks in hashless allocator (all promoted)
         assert len(allocator._hashless_allocator._free_block_indices) == 0
+        # Ensure all blocks are cached
         assert list(allocator._cached_blocks.values()) == all_blocks_list
+        # Ensure all blocks are inside the evictor
         assert list(allocator.evictor.free_table.keys()) == all_blocks_list
+        # Ensure 0s refcounts
         assert allocator._refcounter._refcounts == zero_ref
 
         # Allocate a mutable block, and the first block shall be evicted
         # and set its content hash into None, ref to 1
-        mutable = allocator.allocate_mutable(prev_block=None)
+        mutable = allocator.allocate_mutable_block(prev_block=None)
 
         assert mutable.block_id == 0
         assert mutable.content_hash is None
-        assert 0 in allocator._blocks
+        assert allocator._block_tracker[0].active
         assert allocator._refcounter.get(0) == 1
         assert 0 not in allocator._cached_blocks
         assert 0 not in allocator.evictor
@@ -502,27 +530,27 @@ class TestPrefixCachingBlockAllocator:
         # hashless allocator
         allocator.free(mutable)
 
-        assert len(allocator._blocks.keys()) == 0
+        assert not allocator._block_tracker[0].active
         assert allocator._refcounter._refcounts == zero_ref
         assert 0 not in allocator._cached_blocks
         assert 0 not in allocator.evictor
         assert 0 in allocator._hashless_allocator._free_block_indices
 
-        # when allocate immutable with first block_size tokens, we
+        # When allocate immutable with first block_size tokens, we
         # shall get free block from hashless allocator, thus no block left
         # in hashless
-        block = allocator.allocate_immutable(prev_block=None,
-                                             token_ids=token_ids[:block_size])
+        block = allocator.allocate_immutable_block(
+            prev_block=None, token_ids=token_ids[:block_size])
 
         assert block.block_id == 0
         assert len(allocator._hashless_allocator._free_block_indices) == 0
-        assert 0 in allocator._blocks
+        assert allocator._block_tracker[0].active
         assert 0 in allocator._cached_blocks.values()
         assert allocator._refcounter.get(0) == 1
         assert 0 not in allocator.evictor
 
         # allocate mutable block again, it shall be popped from evictor
-        mutable = allocator.allocate_mutable(prev_block=None)
+        mutable = allocator.allocate_mutable_block(prev_block=None)
         assert len(allocator._hashless_allocator._free_block_indices) == 0
         assert mutable.block_id not in allocator.evictor.free_table
         assert allocator._refcounter.get(mutable.block_id) == 1
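For orientation, the lifecycle this test walks through (free in hashless allocator, then active and cached, then parked in the evictor, then reused) can be exercised with just the public calls renamed in this PR. A condensed sketch, assuming the same import path the tests use and a vLLM build that contains this change:

```python
# Condensed lifecycle sketch; import path as used by vLLM's block tests.
from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator

allocator = PrefixCachingBlockAllocator(num_blocks=4, block_size=16)
token_ids = list(range(4 * 16))

# Full blocks are immutable: hashed, cached, refcount 1.
blocks = [
    allocator.allocate_immutable_block(
        prev_block=None, token_ids=token_ids[16 * i:16 * (i + 1)])
    for i in range(4)
]

# Freeing drops refcounts to zero and parks the blocks in the evictor;
# their content hashes stay cached for future prefix hits.
for block in blocks:
    allocator.free(block)

# A new mutable block must evict one parked block and clear its hash.
mutable = allocator.allocate_mutable_block(prev_block=None)
assert mutable.content_hash is None
```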
@@ -619,7 +647,7 @@ class TestPrefixCachingBlockAllocator:
             block_token_ids = token_ids[block_number *
                                         block_size:(block_number + 1) *
                                         block_size]
-            prev_block = allocator.allocate_immutable(
+            prev_block = allocator.allocate_immutable_block(
                 prev_block=prev_block, token_ids=block_token_ids)
             blocks.append(prev_block)
 
|
@ -90,10 +90,10 @@ def test_create_single_target_seq_group_metadata(k: int):
|
|||||||
|
|
||||||
assert output.request_id == input_seq_group_metadata.request_id
|
assert output.request_id == input_seq_group_metadata.request_id
|
||||||
assert len(output.seq_data) == 1
|
assert len(output.seq_data) == 1
|
||||||
assert output.seq_data[target_seq_id].get_prompt_token_ids(
|
assert output.seq_data[target_seq_id].get_prompt_token_ids() == tuple(
|
||||||
) == prompt_tokens
|
prompt_tokens)
|
||||||
assert output.seq_data[target_seq_id].get_output_token_ids(
|
assert output.seq_data[target_seq_id].get_output_token_ids() == tuple(
|
||||||
) == prev_output_tokens + token_ids
|
prev_output_tokens + token_ids)
|
||||||
|
|
||||||
assert len(output.block_tables) == 1
|
assert len(output.block_tables) == 1
|
||||||
assert output.block_tables[
|
assert output.block_tables[
|
||||||
|
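This hunk and the `conftest.py` change above point at the same underlying adjustment: sequence token ids are now exposed as immutable tuples rather than lists, so call sites either wrap them in `list(...)` before concatenating or compare against `tuple(...)`. A minimal illustration of the pattern; `FakeSeqData` is a hypothetical stand-in, not the real `SequenceData`:

```python
from typing import Tuple


class FakeSeqData:
    """Stand-in modeling only the accessors used in the asserts above,
    which now return tuples."""

    def __init__(self, prompt: Tuple[int, ...], output: Tuple[int, ...]):
        self._prompt = prompt
        self._output = output

    def get_prompt_token_ids(self) -> Tuple[int, ...]:
        return self._prompt

    def get_output_token_ids(self) -> Tuple[int, ...]:
        return self._output


seq_data = FakeSeqData(prompt=(1, 2, 3), output=(4, 5))
prompt_tokens = [1, 2, 3]

# Comparisons must be tuple-vs-tuple...
assert seq_data.get_prompt_token_ids() == tuple(prompt_tokens)
# ...and concatenation with lists requires materializing a list first.
all_ids = prompt_tokens + list(seq_data.get_output_token_ids())
assert all_ids == [1, 2, 3, 4, 5]
```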
@@ -1,5 +1,6 @@
 from typing import List, Optional
 
+from vllm.core.block.common import BlockList
 from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator
 from vllm.utils import Device, cdiv, chunk_list
 
@@ -47,12 +48,10 @@ class BlockTable:
         self._allocator = block_allocator
         if _blocks is None:
             _blocks = []
-        self._blocks: List[Block] = _blocks
+        self._blocks: BlockList = BlockList(_blocks)
 
         self._max_block_sliding_window = max_block_sliding_window
-        # Use helper method instead of directly calculating, as blocks
-        # may not be allocated.
-        self._num_full_slots = len(self._get_all_token_ids())
+        self._num_full_slots = self._get_num_token_ids()
 
     @staticmethod
     def get_num_required_blocks(token_ids: List[int], block_size: int) -> int:
@@ -88,11 +87,18 @@ class BlockTable:
         """
         assert not self._is_allocated
         assert token_ids
-        self._blocks = self._allocate_blocks_for_token_ids(prev_block=None,
-                                                           token_ids=token_ids,
-                                                           device=device)
+        blocks = self._allocate_blocks_for_token_ids(prev_block=None,
+                                                     token_ids=token_ids,
+                                                     device=device)
+        self.update(blocks)
         self._num_full_slots = len(token_ids)
 
+    def update(self, blocks: List[Block]) -> None:
+        """Resets the table to the newly provided blocks
+        (with their corresponding block ids)
+        """
+        self._blocks.update(blocks)
+
     def append_token_ids(self,
                          token_ids: List[int],
                          num_lookahead_slots: int = 0,
@@ -140,11 +146,11 @@ class BlockTable:
                                                     num_lookahead_slots)
 
         # Update the blocks with the new tokens
-        blocks = self._blocks[self._num_full_slots // self._block_size:]
+        first_block_idx = self._num_full_slots // self._block_size
         token_blocks = self._chunk_token_blocks_for_append(token_ids)
 
-        for block, token_block in zip(blocks, token_blocks):
-            block.append_token_ids(token_block)
+        for i, token_block in enumerate(token_blocks):
+            self._blocks.append_token_ids(first_block_idx + i, token_block)
 
         self._num_full_slots += len(token_ids)
 
@@ -174,8 +180,8 @@ class BlockTable:
         for _ in range(blocks_to_allocate):
             assert len(self._blocks) > 0
             self._blocks.append(
-                self._allocator.allocate_mutable(prev_block=self._blocks[-1],
-                                                 device=device))
+                self._allocator.allocate_mutable_block(
+                    prev_block=self._blocks[-1], device=device))
 
     def fork(self) -> "BlockTable":
         """Creates a new BlockTable instance with a copy of the blocks from the
@@ -209,12 +215,12 @@ class BlockTable:
         is set to `None`.
         """
         assert self._is_allocated
-        for block in self._blocks:
+        for block in self.blocks:
             self._allocator.free(block)
-        self._blocks = []
+        self._blocks.reset()
 
     @property
-    def physical_block_ids(self) -> List[Optional[int]]:
+    def physical_block_ids(self) -> List[int]:
         """Returns a list of physical block indices for the blocks in the
         BlockTable.
 
@@ -228,7 +234,7 @@ class BlockTable:
         BlockTable.
         """
         assert self._is_allocated
-        return [block.block_id for block in self._blocks]
+        return self._blocks.ids()
 
     def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]:
         """Get the number of "unseen" tokens in the sequence.
@@ -253,17 +259,31 @@ class BlockTable:
                                        token_ids: List[int],
                                        device: Device) -> List[Block]:
         blocks: List[Block] = []
-        for block_token_ids in chunk_list(token_ids, self._block_size):
-            if len(block_token_ids) == self._block_size:
-                # If the block is full, create an immutable block.
-                prev_block = self._allocator.allocate_immutable(
-                    prev_block, token_ids=block_token_ids, device=device)
-            else:
-                # Else, partially fill a mutable block with token ids.
-                prev_block = self._allocator.allocate_mutable(
-                    prev_block=prev_block, device=device)
-                prev_block.append_token_ids(block_token_ids)
-            blocks.append(prev_block)
+
+        block_token_ids = []
+        tail_token_ids = []
+        for cur_token_ids in chunk_list(token_ids, self._block_size):
+            if len(cur_token_ids) == self._block_size:
+                block_token_ids.append(cur_token_ids)
+            else:
+                tail_token_ids.append(cur_token_ids)
+
+        if block_token_ids:
+            blocks.extend(
+                self._allocator.allocate_immutable_blocks(
+                    prev_block, block_token_ids=block_token_ids,
+                    device=device))
+            prev_block = blocks[-1]
+
+        if tail_token_ids:
+            assert len(tail_token_ids) == 1
+            cur_token_ids = tail_token_ids[0]
+
+            block = self._allocator.allocate_mutable_block(
+                prev_block=prev_block, device=device)
+            block.append_token_ids(cur_token_ids)
+
+            blocks.append(block)
+
         return blocks
 
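The rewritten helper above batches all full chunks into a single `allocate_immutable_blocks` call and only falls back to a per-block mutable allocation for the partial tail, cutting the per-block Python overhead that motivated this PR. A standalone sketch of the same splitting logic, with a local `chunk_list` mirroring the semantics of `vllm.utils.chunk_list`:

```python
from typing import Iterable, List


def chunk_list(lst: List[int], size: int) -> Iterable[List[int]]:
    """Same behavior as vllm.utils.chunk_list: fixed-size chunks, with a
    shorter final chunk when len(lst) is not a multiple of size."""
    for i in range(0, len(lst), size):
        yield lst[i:i + size]


def split_for_allocation(token_ids: List[int], block_size: int):
    full_chunks: List[List[int]] = []
    tail_chunks: List[List[int]] = []
    for chunk in chunk_list(token_ids, block_size):
        if len(chunk) == block_size:
            full_chunks.append(chunk)  # one batched immutable allocation
        else:
            tail_chunks.append(chunk)  # at most one mutable tail block
    assert len(tail_chunks) <= 1
    return full_chunks, tail_chunks


full, tail = split_for_allocation(list(range(10)), block_size=4)
assert full == [[0, 1, 2, 3], [4, 5, 6, 7]] and tail == [[8, 9]]
```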
@@ -274,18 +294,25 @@ class BlockTable:
         if not self._is_allocated:
             return token_ids
 
-        for block in self._blocks:
+        for block in self.blocks:
             token_ids.extend(block.token_ids)
 
         return token_ids
 
+    def _get_num_token_ids(self) -> int:
+        res = 0
+        for block in self.blocks:
+            res += len(block.token_ids)
+
+        return res
+
     @property
     def _is_allocated(self) -> bool:
         return len(self._blocks) > 0
 
     @property
-    def blocks(self) -> Optional[List[Block]]:
-        return self._blocks
+    def blocks(self) -> List[Block]:
+        return self._blocks.list()
 
     @property
     def _num_empty_slots(self) -> int:
@@ -1,4 +1,5 @@
-from typing import Dict, Iterable, List, Optional, Protocol, Tuple
+from collections import deque
+from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple
 
 from vllm.core.block.interfaces import Block, BlockAllocator
 
@@ -95,64 +96,40 @@ class CopyOnWriteTracker:
 
     The CopyOnWriteTracker class maintains a mapping of source block indices to
     their corresponding copy-on-write destination block indices. It works in
-    conjunction with a RefCounter and a BlockAllocator to handle reference
-    counting and block allocation.
+    conjunction with a RefCounter.
 
     Args:
         refcounter (RefCounter): The reference counter used to track block
             reference counts.
-        allocator (BlockAllocator): The block allocator used to allocate and
-            free blocks.
     """
 
-    def __init__(
-        self,
-        refcounter: RefCounterProtocol,
-        allocator: BlockAllocator,
-    ):
+    def __init__(self, refcounter: RefCounterProtocol):
         self._copy_on_writes: List[Tuple[BlockId, BlockId]] = []
         self._refcounter = refcounter
-        self._allocator = allocator
 
-    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
-        """Performs a copy-on-write operation on the given block if it is not
-        appendable.
-
-        This method checks the reference count of the given block. If the
-        reference count is greater than 1, indicating that the block is shared,
-        a copy-on-write operation is performed. The original block is freed,
-        and a new block is allocated with the same content. The new block index
-        is returned.
-
-        Args:
-            block (Block): The block to check for copy-on-write.
-
-        Returns:
-            Optional[BlockId]: The block index of the new block if a copy-on
-                -write operation was performed, or the original block index if
-                no copy-on-write was necessary.
+    def is_appendable(self, block: Block) -> bool:
+        """Checks if the block is shared or not. If shared, then it cannot
+        be appended and needs to be duplicated via copy-on-write
         """
         block_id = block.block_id
         if block_id is None:
-            return block_id
+            return True
 
         refcount = self._refcounter.get(block_id)
-        assert refcount != 0
-        if refcount > 1:
-            src_block_id = block_id
-            # Decrement refcount of the old block.
-            self._allocator.free(block)
-
-            # Allocate a fresh new block.
-            block_id = self._allocator.allocate_mutable(
-                prev_block=block.prev_block).block_id
-
-            # Track src/dst copy.
-            assert src_block_id is not None
-            assert block_id is not None
-            self._copy_on_writes.append((src_block_id, block_id))
-
-        return block_id
+        return refcount <= 1
+
+    def record_cow(self, src_block_id: Optional[BlockId],
+                   trg_block_id: Optional[BlockId]) -> None:
+        """Records a copy-on-write operation from source to target block id
+
+        Args:
+            src_block_id (BlockId): The source block id from which to copy
+                the data
+            trg_block_id (BlockId): The target block id to which the data
+                is copied
+        """
+        assert src_block_id is not None
+        assert trg_block_id is not None
+        self._copy_on_writes.append((src_block_id, trg_block_id))
 
     def clear_cows(self) -> List[Tuple[BlockId, BlockId]]:
         """Clears the copy-on-write tracking information and returns the current
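After this refactor the tracker only answers "is this block shared?" and logs source/target pairs; freeing the old block and allocating the replacement moved into the allocators (see `cow_block_if_not_appendable` on `BlockAllocator` below). A sketch of how a call site is expected to combine the two new methods; the function and its parameters are illustrative, not code from this PR:

```python
from typing import List


def append_with_cow(tracker, allocator, block, token_ids: List[int]):
    """Illustrative driver: `tracker` is a CopyOnWriteTracker, `allocator`
    any BlockAllocator, `block` the block being appended to."""
    if not tracker.is_appendable(block):
        # Shared block (refcount > 1): give up our reference, take a
        # private mutable copy, and record the pair for the engine to apply.
        src_block_id = block.block_id
        allocator.free(block)
        block = allocator.allocate_mutable_block(prev_block=block.prev_block)
        tracker.record_cow(src_block_id, block.block_id)
    block.append_token_ids(token_ids)
    return block
```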
@@ -172,6 +149,139 @@ class CopyOnWriteTracker:
         return cows
 
 
+class BlockPool:
+    """Used to pre-allocate block objects, in order to avoid excessive python
+    object allocations/deallocations.
+    The pool starts from "pool_size" objects and will increase to more objects
+    if necessary
+
+    Note that multiple block objects may point to the same physical block id,
+    which is why this pool is needed, so that it will be easier to support
+    prefix caching and more complicated sharing of physical blocks.
+    """
+
+    def __init__(self, block_size: int, create_block: Block.Factory,
+                 allocator: BlockAllocator, pool_size: int):
+        self._block_size = block_size
+        self._create_block = create_block
+        self._allocator = allocator
+        self._pool_size = pool_size
+        assert self._pool_size >= 0
+
+        self._free_ids: Deque[int] = deque(range(self._pool_size))
+        self._pool = []
+        for i in range(self._pool_size):
+            self._pool.append(
+                self._create_block(prev_block=None,
+                                   token_ids=[],
+                                   block_size=self._block_size,
+                                   allocator=self._allocator,
+                                   block_id=None))
+
+    def increase_pool(self):
+        """Doubles the internal pool size
+        """
+        cur_pool_size = self._pool_size
+        new_pool_size = cur_pool_size * 2
+        self._pool_size = new_pool_size
+
+        self._free_ids += deque(range(cur_pool_size, new_pool_size))
+
+        for i in range(cur_pool_size, new_pool_size):
+            self._pool.append(
+                self._create_block(prev_block=None,
+                                   token_ids=[],
+                                   block_size=self._block_size,
+                                   allocator=self._allocator,
+                                   block_id=None))
+
+    def init_block(self, prev_block: Optional[Block], token_ids: List[int],
+                   block_size: int, physical_block_id: Optional[int]) -> Block:
+        if len(self._free_ids) == 0:
+            self.increase_pool()
+            assert len(self._free_ids) > 0
+
+        pool_id = self._free_ids.popleft()
+
+        block = self._pool[pool_id]
+        block.__init__(  # type: ignore[misc]
+            prev_block=prev_block,
+            token_ids=token_ids,
+            block_size=block_size,
+            allocator=block._allocator,  # type: ignore[attr-defined]
+            block_id=physical_block_id)
+        block.pool_id = pool_id  # type: ignore[attr-defined]
+        return block
+
+    def free_block(self, block: Block) -> None:
+        self._free_ids.appendleft(block.pool_id)  # type: ignore[attr-defined]
+
+
+class BlockList:
+    """This class is an optimization to allow fast-access to physical
+    block ids. It maintains a block id list that is updated with the
+    block list and this avoids the need to reconstruct the block id
+    list on every iteration of the block manager
+    """
+
+    def __init__(self, blocks: List[Block]):
+        self._blocks: List[Block] = []
+        self._block_ids: List[int] = []
+
+        self.update(blocks)
+
+    def _add_block_id(self, block_id: Optional[BlockId]) -> None:
+        assert block_id is not None
+        self._block_ids.append(block_id)
+
+    def _update_block_id(self, block_index: int,
+                         new_block_id: Optional[BlockId]) -> None:
+        assert new_block_id is not None
+        self._block_ids[block_index] = new_block_id
+
+    def update(self, blocks: List[Block]):
+        self._blocks = blocks
+
+        # Cache block ids for fast query
+        self._block_ids = []
+        for block in self._blocks:
+            self._add_block_id(block.block_id)
+
+    def append_token_ids(self, block_index: int, token_ids: List[int]) -> None:
+        block = self._blocks[block_index]
+        prev_block_id = block.block_id
+
+        block.append_token_ids(token_ids)
+
+        # CoW or promotion may update the internal block_id
+        if prev_block_id != block.block_id:
+            self._update_block_id(block_index, block.block_id)
+
+    def append(self, new_block: Block):
+        self._blocks.append(new_block)
+        self._add_block_id(new_block.block_id)
+
+    def __len__(self) -> int:
+        return len(self._blocks)
+
+    def __getitem__(self, block_index: int) -> Block:
+        return self._blocks[block_index]
+
+    def __setitem__(self, block_index: int, new_block: Block) -> None:
+        self._blocks[block_index] = new_block
+        self._update_block_id(block_index, new_block.block_id)
+
+    def reset(self):
+        self._blocks = []
+        self._block_ids = []
+
+    def list(self) -> List[Block]:
+        return self._blocks
+
+    def ids(self) -> List[int]:
+        return self._block_ids
+
+
 def get_all_blocks_recursively(last_block: Block) -> List[Block]:
     """Retrieves all the blocks in a sequence starting from the last block.
 
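These two helper classes carry most of the optimization: `BlockPool` recycles Python `Block` objects instead of constructing fresh ones on every allocation, and `BlockList` keeps the physical block ids cached next to the blocks so `physical_block_ids` is a stored list rather than a per-call comprehension. A self-contained toy version of the pooling half (simplified; the real class is the one added above):

```python
from collections import deque
from typing import Deque, List, Optional


class ToyBlock:
    def __init__(self) -> None:
        self.block_id: Optional[int] = None
        self.token_ids: List[int] = []


class ToyBlockPool:
    """Recycle block objects: O(1) reuse instead of alloc/GC churn."""

    def __init__(self, pool_size: int) -> None:
        self._pool = [ToyBlock() for _ in range(pool_size)]
        self._free_ids: Deque[int] = deque(range(pool_size))

    def init_block(self, physical_block_id: int) -> ToyBlock:
        if not self._free_ids:  # grow by doubling, like BlockPool
            old = len(self._pool)
            self._pool.extend(ToyBlock() for _ in range(old))
            self._free_ids.extend(range(old, 2 * old))
        pool_id = self._free_ids.popleft()
        block = self._pool[pool_id]
        block.__init__()  # re-initialize the recycled object in place
        block.block_id = physical_block_id
        block.pool_id = pool_id
        return block

    def free_block(self, block: ToyBlock) -> None:
        self._free_ids.appendleft(block.pool_id)


pool = ToyBlockPool(pool_size=2)
a, b, c = (pool.init_block(i) for i in range(3))  # triggers one doubling
pool.free_block(b)
d = pool.init_block(7)
assert d is b  # the object was recycled, not re-allocated
```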
@@ -113,11 +113,11 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
     def allocate_or_get_null_block(self) -> Block:
         if self._null_block is None:
             self._null_block = NullBlock(
-                self.allocate_mutable(None, Device.GPU))
+                self.allocate_mutable_block(None, Device.GPU))
         return self._null_block
 
-    def allocate_mutable(self, prev_block: Optional[Block],
-                         device: Device) -> Block:
+    def allocate_mutable_block(self, prev_block: Optional[Block],
+                               device: Device) -> Block:
         """Allocates a new mutable block on the specified device.
 
         Args:
@@ -128,10 +128,31 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
         Returns:
             Block: The newly allocated mutable block.
         """
-        return self._allocators[device].allocate_mutable(prev_block)
+        return self._allocators[device].allocate_mutable_block(prev_block)
 
-    def allocate_immutable(self, prev_block: Optional[Block],
-                           token_ids: List[int], device: Device) -> Block:
+    def allocate_immutable_blocks(self, prev_block: Optional[Block],
+                                  block_token_ids: List[List[int]],
+                                  device: Optional[Device]) -> List[Block]:
+        """Allocates a new group of immutable blocks with the provided block
+        token IDs on the specified device.
+
+        Args:
+            prev_block (Optional[Block]): The previous block in the sequence.
+                Used for prefix hashing.
+            block_token_ids (List[int]): The list of block token IDs to be
+                stored in the new blocks.
+            device (Device): The device on which to allocate the new block.
+
+        Returns:
+            List[Block]: The newly allocated list of immutable blocks
+                containing the provided block token IDs.
+        """
+        return self._allocators[device].allocate_immutable_blocks(
+            prev_block, block_token_ids)
+
+    def allocate_immutable_block(self, prev_block: Optional[Block],
+                                 token_ids: List[int],
+                                 device: Device) -> Block:
         """Allocates a new immutable block with the provided token IDs on the
         specified device.
 
@@ -146,7 +167,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
             Block: The newly allocated immutable block containing the provided
                 token IDs.
         """
-        return self._allocators[device].allocate_immutable(
+        return self._allocators[device].allocate_immutable_block(
             prev_block, token_ids)
 
     def free(self, block: Block) -> None:
@@ -161,7 +182,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
         block_id = block.block_id
         assert block_id is not None
         allocator = self._block_ids_to_allocator[block_id]
-        return allocator.free(block)
+        allocator.free(block)
 
     def fork(self, last_block: Block) -> List[Block]:
         """Creates a new sequence of blocks that shares the same underlying
|
|||||||
"""
|
"""
|
||||||
return self._allocators[device].get_physical_block_id(absolute_id)
|
return self._allocators[device].get_physical_block_id(absolute_id)
|
||||||
|
|
||||||
def swap(self, blocks: List[Block], source_device: Device,
|
def swap(self, blocks: List[Block], src_device: Device,
|
||||||
dest_device: Device) -> Dict[int, int]:
|
dst_device: Device) -> Dict[int, int]:
|
||||||
"""Execute the swap for the given blocks from source_device
|
"""Execute the swap for the given blocks from source_device
|
||||||
on to dest_device, save the current swap mapping and append
|
on to dest_device, save the current swap mapping and append
|
||||||
them to the accumulated `self._swap_mapping` for each
|
them to the accumulated `self._swap_mapping` for each
|
||||||
@ -219,23 +240,23 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
blocks: List of blocks to be swapped.
|
blocks: List of blocks to be swapped.
|
||||||
source_device (Device): Device to swap the 'blocks' from.
|
src_device (Device): Device to swap the 'blocks' from.
|
||||||
dest_device (Device): Device to swap the 'blocks' to.
|
dst_device (Device): Device to swap the 'blocks' to.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict[int, int]: Swap mapping from source_device
|
Dict[int, int]: Swap mapping from source_device
|
||||||
on to dest_device.
|
on to dest_device.
|
||||||
"""
|
"""
|
||||||
source_block_ids = [block.block_id for block in blocks]
|
src_block_ids = [block.block_id for block in blocks]
|
||||||
self._allocators[source_device].swap_out(blocks)
|
self._allocators[src_device].swap_out(blocks)
|
||||||
self._allocators[dest_device].swap_in(blocks)
|
self._allocators[dst_device].swap_in(blocks)
|
||||||
dest_block_ids = [block.block_id for block in blocks]
|
dst_block_ids = [block.block_id for block in blocks]
|
||||||
|
|
||||||
current_swap_mapping: Dict[int, int] = {}
|
current_swap_mapping: Dict[int, int] = {}
|
||||||
for src, dest in zip(source_block_ids, dest_block_ids):
|
for src_block_id, dst_block_id in zip(src_block_ids, dst_block_ids):
|
||||||
if src is not None and dest is not None:
|
if src_block_id is not None and dst_block_id is not None:
|
||||||
self._swap_mapping[src] = dest
|
self._swap_mapping[src_block_id] = dst_block_id
|
||||||
current_swap_mapping[src] = dest
|
current_swap_mapping[src_block_id] = dst_block_id
|
||||||
return current_swap_mapping
|
return current_swap_mapping
|
||||||
|
|
||||||
def get_num_blocks_touched(self,
|
def get_num_blocks_touched(self,
|
||||||
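The swap path records block-id pairs as it moves blocks between devices: ids are captured before `swap_out`, again after `swap_in`, and zipped into a mapping. A toy reproduction of that bookkeeping (device handling stubbed out; only the mapping logic is the point):

```python
from typing import Dict, List, Optional


class ToyBlk:
    def __init__(self, block_id: Optional[int]) -> None:
        self.block_id = block_id


def swap(blocks: List[ToyBlk], new_ids: List[Optional[int]]) -> Dict[int, int]:
    """Mimics CpuGpuBlockAllocator.swap: swap_out/swap_in rewrite each
    block's id in place; ids are captured before and after, then zipped."""
    src_block_ids = [blk.block_id for blk in blocks]
    for blk, new_id in zip(blocks, new_ids):  # stands in for swap_out/swap_in
        blk.block_id = new_id
    dst_block_ids = [blk.block_id for blk in blocks]

    mapping: Dict[int, int] = {}
    for src_id, dst_id in zip(src_block_ids, dst_block_ids):
        if src_id is not None and dst_id is not None:  # skip null blocks
            mapping[src_id] = dst_id
    return mapping


assert swap([ToyBlk(3), ToyBlk(None)], [10, None]) == {3: 10}
```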
@@ -283,23 +304,25 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
         device = Device.GPU
         return self._allocators[device].mark_blocks_as_computed(block_ids)
 
+    def get_computed_block_ids(self, prev_computed_block_ids: List[int],
+                               block_ids: List[int],
+                               skip_last_block_id: bool) -> List[int]:
+        # Prefix caching only supported on GPU.
+        device = Device.GPU
+        return self._allocators[device].get_computed_block_ids(
+            prev_computed_block_ids, block_ids, skip_last_block_id)
+
     def get_common_computed_block_ids(
-            self, seq_block_ids: List[List[int]]) -> List[int]:
+            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
         # Prefix caching only supported on GPU.
         device = Device.GPU
         return self._allocators[device].get_common_computed_block_ids(
-            seq_block_ids)
+            computed_seq_block_ids)
 
     @property
     def all_block_ids(self) -> FrozenSet[int]:
         return frozenset(self._block_ids_to_allocator.keys())
 
-    def promote_to_immutable_block(self, block: Block) -> BlockId:
-        raise NotImplementedError
-
-    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
-        raise NotImplementedError
-
     def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
         """Returns and clears the mapping of source to destination block IDs.
         Will be called after every swapping operations for now, and after every
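The new `get_computed_block_ids` entry point takes the previously computed ids so the underlying allocator can extend that prefix incrementally instead of rescanning every block on each scheduler step; `skip_last_block_id` leaves the final block out, since it is typically still being written. The implementation lives in the per-device allocators, not in this hunk, so the following is only a plausible sketch of the contract, assuming computed-ness is monotonic along the sequence:

```python
from typing import List, Set


def get_computed_block_ids(prev_computed_block_ids: List[int],
                           block_ids: List[int], skip_last_block_id: bool,
                           computed: Set[int]) -> List[int]:
    """Sketch only: `computed` stands for the allocator's own record of
    which block ids were marked computed."""
    prev_prefix_size = len(prev_computed_block_ids)
    cur_block_ids = block_ids[:-1] if skip_last_block_id else block_ids

    # Blocks already confirmed computed never need re-checking...
    result = list(prev_computed_block_ids)
    # ...so only the newly appended suffix is examined.
    for block_id in cur_block_ids[prev_prefix_size:]:
        if block_id not in computed:
            break
        result.append(block_id)
    return result


assert get_computed_block_ids([0], [0, 1, 2], True, computed={0, 1}) == [0, 1]
```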
@@ -341,6 +364,11 @@ class NullBlock(Block):
     def token_ids(self) -> List[BlockId]:
         return self._proxy.token_ids
 
+    @property
+    def num_tokens_total(self) -> int:
+        raise NotImplementedError(
+            "num_tokens_total is not used for null block")
+
     @property
     def num_empty_slots(self) -> BlockId:
         return self._proxy.num_empty_slots
@@ -28,6 +28,13 @@ class Block(ABC):
     def token_ids(self) -> List[int]:
         pass
 
+    @property
+    @abstractmethod
+    def num_tokens_total(self) -> int:
+        """The number of tokens till the current block (inclusive)
+        """
+        pass
+
     @property
     @abstractmethod
     def num_empty_slots(self) -> int:
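Per its docstring, `num_tokens_total` is cumulative: it counts every token from the start of the chained sequence up to and including this block, letting callers size a sequence without walking the `prev_block` chain themselves. A hedged sketch of the recurrence (illustrative; concrete implementations can cache the value at construction time):

```python
from typing import List, Optional


class ChainedBlock:
    """Toy block illustrating the num_tokens_total recurrence."""

    def __init__(self, prev_block: Optional["ChainedBlock"],
                 token_ids: List[int]) -> None:
        self.prev_block = prev_block
        self.token_ids = token_ids
        # Cumulative count: tokens in all ancestors plus this block.
        self.num_tokens_total = (
            (prev_block.num_tokens_total if prev_block else 0)
            + len(token_ids))


first = ChainedBlock(None, [1, 2, 3, 4])
second = ChainedBlock(first, [5, 6])
assert second.num_tokens_total == 6
```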
@@ -92,12 +99,18 @@ class Block(ABC):
 class BlockAllocator(ABC):
 
     @abstractmethod
-    def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
+    def allocate_mutable_block(self, prev_block: Optional[Block]) -> Block:
         pass
 
     @abstractmethod
-    def allocate_immutable(self, prev_block: Optional[Block],
-                           token_ids: List[int]) -> Block:
+    def allocate_immutable_block(self, prev_block: Optional[Block],
+                                 token_ids: List[int]) -> Block:
+        pass
+
+    @abstractmethod
+    def allocate_immutable_blocks(
+            self, prev_block: Optional[Block],
+            block_token_ids: List[List[int]]) -> List[Block]:
         pass
 
     @abstractmethod
@@ -147,12 +160,18 @@ class BlockAllocator(ABC):
         pass
 
     @abstractmethod
-    def get_common_computed_block_ids(
-            self, seq_block_ids: List[List[int]]) -> List[int]:
+    def get_computed_block_ids(self, prev_computed_block_ids: List[int],
+                               block_ids: List[int],
+                               skip_last_block_id: bool) -> List[int]:
+        pass
+
+    @abstractmethod
+    def get_common_computed_block_ids(
+            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
         pass
 
     @abstractmethod
-    def cow_block_if_not_appendable(self, block: Block) -> Optional["BlockId"]:
+    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
         """NOTE: This should not be used besides Block"""
         pass
 
@ -174,13 +193,20 @@ class BlockAllocator(ABC):
|
|||||||
class DeviceAwareBlockAllocator(ABC):
|
class DeviceAwareBlockAllocator(ABC):
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def allocate_mutable(self, prev_block: Optional[Block],
|
def allocate_mutable_block(self, prev_block: Optional[Block],
|
||||||
device: Device) -> Block:
|
device: Device) -> Block:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def allocate_immutable(self, prev_block: Optional[Block],
|
def allocate_immutable_block(self, prev_block: Optional[Block],
|
||||||
token_ids: List[int], device: Device) -> Block:
|
token_ids: List[int],
|
||||||
|
device: Device) -> Block:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def allocate_immutable_blocks(self, prev_block: Optional[Block],
|
||||||
|
block_token_ids: List[List[int]],
|
||||||
|
device: Device) -> List[Block]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
@ -217,9 +243,15 @@ class DeviceAwareBlockAllocator(ABC):
|
|||||||
def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
|
def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_computed_block_ids(self, prev_computed_block_ids: List[int],
|
||||||
|
block_ids: List[int],
|
||||||
|
skip_last_block_id: bool) -> List[int]:
|
||||||
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def get_common_computed_block_ids(
|
def get_common_computed_block_ids(
|
||||||
self, seq_block_ids: List[List[int]]) -> List[int]:
|
self, computed_seq_block_ids: List[List[int]]) -> List[int]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
@ -230,8 +262,8 @@ class DeviceAwareBlockAllocator(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def swap(self, blocks: List[Block], source_device: Device,
|
def swap(self, blocks: List[Block], src_device: Device,
|
||||||
dest_device: Device) -> Dict[int, int]:
|
dst_device: Device) -> Dict[int, int]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
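
Note: the hunks above split the old allocate_mutable/allocate_immutable pair into allocate_mutable_block, allocate_immutable_block, and a new batched allocate_immutable_blocks. A minimal sketch of how a caller could pre-chunk a prompt for the batched entry point (hypothetical helper, not part of this commit; vllm.utils ships a similar chunk_list):

from typing import List

def chunk_tokens(token_ids: List[int], block_size: int) -> List[List[int]]:
    # Split a prompt into block-sized chunks so the full-block prefix can
    # be handed to allocate_immutable_blocks() in one call instead of one
    # allocate_immutable_block() call per block.
    return [
        token_ids[i:i + block_size]
        for i in range(0, len(token_ids), block_size)
    ]

# A 5-token prompt with block_size=2 -> [[0, 1], [2, 3], [4]]; the trailing
# partial chunk would go to a mutable block rather than the batched call.
assert chunk_tokens([0, 1, 2, 3, 4], 2) == [[0, 1], [2, 3], [4]]
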
@ -1,6 +1,7 @@
-from typing import FrozenSet, Iterable, List, Optional, Set, Tuple
+from collections import deque
+from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple
 
-from vllm.core.block.common import (CopyOnWriteTracker, RefCounter,
+from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
                                     get_all_blocks_recursively)
 from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
 from vllm.utils import cdiv
@ -31,28 +32,39 @@ class NaiveBlockAllocator(BlockAllocator):
         num_blocks: int,
         block_size: int,
         block_ids: Optional[Iterable[int]] = None,
+        block_pool: Optional[BlockPool] = None,
     ):
         if block_ids is None:
             block_ids = range(num_blocks)
 
-        self._free_block_indices: Set[BlockId] = set(block_ids)
+        self._free_block_indices: Deque[BlockId] = deque(block_ids)
         self._all_block_indices = frozenset(block_ids)
         assert len(self._all_block_indices) == num_blocks
 
         self._refcounter = RefCounter(
             all_block_indices=self._free_block_indices)
-        self._create_block = create_block
         self._block_size = block_size
 
         self._cow_tracker = CopyOnWriteTracker(
-            refcounter=self._refcounter.as_readonly(),
-            allocator=self,
-        )
+            refcounter=self._refcounter.as_readonly())
 
-    def allocate_immutable(self,
-                           prev_block: Optional[Block],
-                           token_ids: List[int],
-                           device: Optional[Device] = None) -> Block:
+        if block_pool is None:
+            extra_factor = 4
+            # Pre-allocate "num_blocks * extra_factor" block objects.
+            # The "* extra_factor" is a buffer to allow more block objects
+            # than physical blocks
+            self._block_pool = BlockPool(self._block_size, create_block, self,
+                                         num_blocks * extra_factor)
+        else:
+            # In this case, the block pool is provided by the caller,
+            # which means that there is most likely a need to share
+            # a block pool between allocators
+            self._block_pool = block_pool
+
+    def allocate_immutable_block(self,
+                                 prev_block: Optional[Block],
+                                 token_ids: List[int],
+                                 device: Optional[Device] = None) -> Block:
         """Allocates a new immutable block with the given token IDs, linked to
         the previous block.
 
@ -66,13 +78,36 @@ class NaiveBlockAllocator(BlockAllocator):
             Block: The newly allocated immutable block.
         """
         assert device is None
-        block = self.allocate_mutable(prev_block=prev_block)
+        block = self.allocate_mutable_block(prev_block=prev_block)
         block.append_token_ids(token_ids)
         return block
 
-    def allocate_mutable(self,
-                         prev_block: Optional[Block],
-                         device: Optional[Device] = None) -> Block:
+    def allocate_immutable_blocks(
+            self,
+            prev_block: Optional[Block],
+            block_token_ids: List[List[int]],
+            device: Optional[Device] = None) -> List[Block]:
+        assert device is None
+        num_blocks = len(block_token_ids)
+
+        block_ids = []
+        for i in range(num_blocks):
+            block_ids.append(self._allocate_block_id())
+
+        blocks = []
+        for i in range(num_blocks):
+            prev_block = self._block_pool.init_block(
+                prev_block=prev_block,
+                token_ids=block_token_ids[i],
+                block_size=self._block_size,
+                physical_block_id=block_ids[i])
+            blocks.append(prev_block)
+
+        return blocks
+
+    def allocate_mutable_block(self,
+                               prev_block: Optional[Block],
+                               device: Optional[Device] = None) -> Block:
         """Allocates a new mutable block, linked to the previous block.
 
         Args:
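
Note: BlockPool is defined in vllm/core/block/common.py and does not appear in this excerpt. The sketch below shows the object-pool pattern the constructor above relies on, in simplified form (an assumption-level illustration, not the real class):

class PooledBlock:
    def __init__(self, token_ids=None, block_id=None):
        self.token_ids = list(token_ids or [])
        self.block_id = block_id

class SimpleBlockPool:
    """Pre-allocate block objects once and recycle them, so the hot
    allocate/free path constructs no new Python objects."""

    def __init__(self, pool_size: int):
        self._free = [PooledBlock() for _ in range(pool_size)]

    def init_block(self, token_ids, block_id) -> PooledBlock:
        block = self._free.pop()  # reuse an existing object
        block.__init__(token_ids=token_ids, block_id=block_id)
        return block

    def free_block(self, block: PooledBlock) -> None:
        self._free.append(block)  # return it for reuse

pool = SimpleBlockPool(pool_size=2)
b = pool.init_block(token_ids=[1, 2], block_id=0)
pool.free_block(b)  # recycled, not garbage-collected
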
@ -84,20 +119,39 @@ class NaiveBlockAllocator(BlockAllocator):
             Block: The newly allocated mutable block.
         """
         assert device is None
-        block_id = self._allocate_new_block_id()
-        return self._create_block(
-            prev_block=prev_block,
-            token_ids=[],
-            block_id=block_id,
-            block_size=self._block_size,
-            allocator=self,
-        )
+        block_id = self._allocate_block_id()
+        block = self._block_pool.init_block(prev_block=prev_block,
+                                            token_ids=[],
+                                            block_size=self._block_size,
+                                            physical_block_id=block_id)
+        return block
+
+    def _allocate_block_id(self) -> BlockId:
+        if not self._free_block_indices:
+            raise BlockAllocator.NoFreeBlocksError()
+
+        block_id = self._free_block_indices.popleft()
+        self._refcounter.incr(block_id)
+        return block_id
+
+    def _free_block_id(self, block: Block) -> None:
+        block_id = block.block_id
+        assert block_id is not None
+
+        refcount = self._refcounter.decr(block_id)
+        if refcount == 0:
+            self._free_block_indices.appendleft(block_id)
 
-    def free(self, block: Block) -> None:
-        assert block.block_id is not None
-        self._free_block_id(block.block_id)
         block.block_id = None
 
+    def free(self, block: Block, keep_block_object: bool = False) -> None:
+        # Release the physical block id
+        self._free_block_id(block)
+
+        # Release the block object
+        if not keep_block_object:
+            self._block_pool.free_block(block)
+
     def fork(self, last_block: Block) -> List[Block]:
         """Creates a new sequence of blocks that shares the same underlying
         memory as the original sequence.
@ -120,14 +174,13 @@ class NaiveBlockAllocator(BlockAllocator):
             refcount = self._refcounter.incr(block.block_id)
             assert refcount != 1, "can't fork free'd block"
 
-            forked_blocks.append(
-                self._create_block(
-                    prev_block=prev_block,
-                    token_ids=block.token_ids,
-                    block_id=block.block_id,
-                    block_size=self._block_size,
-                    allocator=self,
-                ))
+            forked_block = self._block_pool.init_block(
+                prev_block=prev_block,
+                token_ids=block.token_ids,
+                block_size=self._block_size,
+                physical_block_id=block.block_id)
+
+            forked_blocks.append(forked_block)
             prev_block = forked_blocks[-1]
 
         return forked_blocks
@ -138,20 +191,6 @@ class NaiveBlockAllocator(BlockAllocator):
     def get_num_total_blocks(self) -> int:
         return len(self._all_block_indices)
 
-    def _allocate_new_block_id(self) -> BlockId:
-        if not self._free_block_indices:
-            raise BlockAllocator.NoFreeBlocksError()
-
-        block_id = next(iter(self._free_block_indices))
-        self._refcounter.incr(block_id)
-        self._free_block_indices.remove(block_id)
-        return block_id
-
-    def _free_block_id(self, block_id: BlockId) -> None:
-        refcount = self._refcounter.decr(block_id)
-        if refcount == 0:
-            self._free_block_indices.add(block_id)
-
     def get_physical_block_id(self, absolute_id: int) -> int:
         """Returns the zero-offset block id on certain block allocator
         given the absolute block id.
@ -173,7 +212,7 @@ class NaiveBlockAllocator(BlockAllocator):
     def all_block_ids(self) -> FrozenSet[int]:
         return self._all_block_indices
 
-    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
+    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
         """Performs a copy-on-write operation on the given block if it is not
         appendable.
 
@ -181,11 +220,22 @@ class NaiveBlockAllocator(BlockAllocator):
             block (Block): The block to check for copy-on-write.
 
         Returns:
-            Optional[BlockId]: The block index of the new block if a copy-on
-                -write operation was performed, or the original block index if
+            BlockId: The block index of the new block if a copy-on-write
+                operation was performed, or the original block index if
                 no copy-on-write was necessary.
         """
-        return self._cow_tracker.cow_block_if_not_appendable(block)
+        src_block_id = block.block_id
+        assert src_block_id is not None
+
+        if self._cow_tracker.is_appendable(block):
+            return src_block_id
+
+        self._free_block_id(block)
+        trg_block_id = self._allocate_block_id()
+
+        self._cow_tracker.record_cow(src_block_id, trg_block_id)
+
+        return trg_block_id
 
     def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
         """Returns the copy-on-write source->destination mapping and clears it.
@ -213,8 +263,15 @@ class NaiveBlockAllocator(BlockAllocator):
         """
         pass
 
+    def get_computed_block_ids(self, prev_computed_block_ids: List[int],
+                               block_ids: List[int],
+                               skip_last_block_id: bool) -> List[int]:
+        """No prefix caching here => return empty list
+        """
+        return []
+
     def get_common_computed_block_ids(
-            self, seq_block_ids: List[List[int]]) -> List[int]:
+            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
         """Determine blocks that can be skipped in prefill.
 
         Since the naive allocator does not support prefix caching, always return
@ -223,7 +280,7 @@ class NaiveBlockAllocator(BlockAllocator):
         return []
 
     def promote_to_immutable_block(self, block: Block) -> BlockId:
-        raise NotImplementedError
+        raise NotImplementedError("There is no promotion for naive blocks")
 
     def get_num_blocks_touched(self,
                                blocks: List[Block],
@ -263,17 +320,27 @@ class NaiveBlockAllocator(BlockAllocator):
 
     def swap_out(self, blocks: List[Block]) -> None:
         for block in blocks:
-            self.free(block)
+            self._free_block_id(block)
 
     def swap_in(self, blocks: List[Block]) -> None:
         for block in blocks:
+            # Here we allocate either immutable or mutable block and then
+            # extract its block_id. Note that the block object is released
+            # and the block_id is assigned to "block" to allow reusing the
+            # existing "block" object
             if block.is_full:
-                alloc = self.allocate_immutable(block.prev_block,
-                                                block.token_ids)
+                tmp_block = self.allocate_immutable_block(
+                    prev_block=block.prev_block, token_ids=block.token_ids)
             else:
-                alloc = self.allocate_mutable(block.prev_block)
-                alloc.append_token_ids(block.token_ids)
-            block.block_id = alloc.block_id
+                tmp_block = self.allocate_mutable_block(
+                    prev_block=block.prev_block)
+                tmp_block.append_token_ids(block.token_ids)
+
+            block_id = tmp_block.block_id
+            tmp_block.block_id = None
+            self._block_pool.free_block(tmp_block)
+
+            block.block_id = block_id  # Assign block_id
 
 
 class NaiveBlock(Block):
@ -315,11 +382,12 @@ class NaiveBlock(Block):
             self._append_token_ids_no_cow(token_ids)
 
     def append_token_ids(self, token_ids: List[int]) -> None:
-        """Appends the given token IDs to the block, instructing the allocator
-        to perform a copy-on-write if necessary.
+        """Appends the given token IDs to the block and performs a
+        copy-on-write if necessary.
 
         Args:
-            token_ids (List[int]): The token IDs to be appended to the block.
+            token_ids (Optional[List[int]]): The token IDs to be appended
+                to the block.
         """
         self._append_token_ids_no_cow(token_ids)
 
@ -328,7 +396,16 @@ class NaiveBlock(Block):
                                    self._cow_target))
 
     def _append_token_ids_no_cow(self, token_ids: List[int]) -> None:
-        assert self.num_empty_slots >= len(token_ids)
+        """Appends the given token IDs to the block
+
+        Args:
+            token_ids (List[int]): The token IDs to be appended to the block.
+        """
+        if len(token_ids) == 0:
+            return
+
+        assert len(token_ids) <= self.num_empty_slots
+
         self._token_ids.extend(token_ids)
 
     @property
@ -361,12 +438,17 @@ class NaiveBlock(Block):
 
     @property
     def num_empty_slots(self) -> int:
-        return self._block_size - len(self._token_ids)
+        return self._block_size - len(self.token_ids)
 
     @property
     def token_ids(self) -> List[int]:
         return self._token_ids
 
+    @property
+    def num_tokens_total(self) -> int:
+        raise NotImplementedError(
+            "num_tokens_total is not used for naive block")
+
     @property
     def block_size(self) -> int:
         return self._block_size
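
Note: the free-list switch above (Set -> Deque) is what makes _allocate_block_id/_free_block_id O(1): popleft avoids the old next(iter(set)) plus set.remove() pair, and appendleft hands a just-freed id back out first. A toy illustration (not commit code):

from collections import deque

free_ids = deque(range(4))  # free physical block ids

def allocate_id() -> int:
    # O(1); the old set-based version paid next(iter(...)) + remove().
    return free_ids.popleft()

def free_id(block_id: int) -> None:
    # appendleft => a just-freed id is handed out again first (LIFO),
    # keeping the set of physical blocks actually touched small.
    free_ids.appendleft(block_id)

a = allocate_id()          # -> 0
free_id(a)
assert allocate_id() == 0  # freed id is reused immediately
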
@ -1,13 +1,13 @@
 """Token blocks."""
 
-from itertools import takewhile
 from os.path import commonprefix
 from typing import Dict, FrozenSet, Iterable, List, Optional, Tuple
 
 from vllm.core.block.common import (CopyOnWriteTracker,
                                     get_all_blocks_recursively)
 from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
-from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
+from vllm.core.block.naive_block import (BlockPool, NaiveBlock,
+                                         NaiveBlockAllocator)
 from vllm.core.evictor_v2 import EvictionPolicy, Evictor, make_evictor
 from vllm.utils import cdiv
 
@ -19,6 +19,30 @@ PrefixHash = int
 _DEFAULT_LAST_ACCESSED_TIME = -1
 
 
+class BlockTracker:
+    """Used to track the status of a block inside the prefix caching allocator
+    """
+    __slots__ = ("active", "last_accessed", "computed")
+
+    def reset(self):
+        self.last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
+        self.computed: bool = False
+
+    def __init__(self):
+        self.active: bool = False
+        self.reset()
+
+    def enable(self):
+        assert not self.active
+        self.active = True
+        self.reset()
+
+    def disable(self):
+        assert self.active
+        self.active = False
+        self.reset()
+
+
 class PrefixCachingBlockAllocator(BlockAllocator):
     """A block allocator that implements prefix caching.
 
@ -41,12 +65,26 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         block_ids: Optional[Iterable[int]] = None,
         eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
     ):
+        if block_ids is None:
+            block_ids = range(num_blocks)
+
+        self._block_size = block_size
+
         # A mapping of prefix hash to block index. All blocks which have a
         # prefix hash will be in this dict, even if they have refcount 0.
         self._cached_blocks: Dict[PrefixHash, BlockId] = {}
 
-        # A mapping of blockId to Block to track those cached blocks
-        self._blocks: Dict[BlockId, Block] = {}
+        # Used to track status of each physical block id
+        self._block_tracker: Dict[BlockId, BlockTracker] = {}
+        for block_id in block_ids:
+            self._block_tracker[block_id] = BlockTracker()
+
+        # Pre-allocate "num_blocks * extra_factor" block objects.
+        # The "* extra_factor" is a buffer to allow more block objects
+        # than physical blocks
+        extra_factor = 4
+        self._block_pool = BlockPool(self._block_size, self._create_block,
+                                     self, num_blocks * extra_factor)
 
         # An allocator for blocks that do not have prefix hashes.
         self._hashless_allocator = NaiveBlockAllocator(
@ -54,10 +92,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             num_blocks=num_blocks,
             block_size=block_size,
             block_ids=block_ids,
+            block_pool=self._block_pool,  # Share block pool here
         )
 
-        self._block_size = block_size
-
         # Evitor used to maintain how we want to handle those computed blocks
         # if we find memory pressure is high.
         self.evictor: Evictor = make_evictor(eviction_policy)
@ -68,9 +105,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         self._refcounter = self._hashless_allocator.refcounter
 
         self._cow_tracker = CopyOnWriteTracker(
-            refcounter=self._refcounter.as_readonly(),
-            allocator=self,
-        )
+            refcounter=self._refcounter.as_readonly())
 
         # Implements Block.Factory.
         def _create_block(
@ -90,14 +125,14 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             token_ids=token_ids,
             block_size=block_size,
             block_id=block_id,
-            prefix_caching_allocator=allocator,
+            allocator=allocator,
             computed=computed,
         )
 
-    def allocate_immutable(self,
-                           prev_block: Optional[Block],
-                           token_ids: List[int],
-                           device: Optional[Device] = None) -> Block:
+    def allocate_immutable_block(self,
+                                 prev_block: Optional[Block],
+                                 token_ids: List[int],
+                                 device: Optional[Device] = None) -> Block:
         """Allocates an immutable block with the given token IDs, reusing cached
         blocks if possible.
 
@ -111,29 +146,41 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         assert device is None
         assert_prefix_caching_block_or_none(prev_block)
 
-        block = self._create_block(
-            prev_block=prev_block,
-            token_ids=token_ids,
-            block_size=self._block_size,
-            allocator=self,
-        )
+        # First, try to create a block that points to cached data
+        block = self._block_pool.init_block(prev_block=prev_block,
+                                            token_ids=token_ids,
+                                            block_size=self._block_size,
+                                            physical_block_id=None)
         assert block.content_hash is not None
 
         cached_block_id = self._cached_blocks.get(block.content_hash, None)
         if cached_block_id is not None:
             block.block_id = cached_block_id
-            self._incr_refcount_cached_block(block, block.block_id)
+            self._incr_refcount_cached_block(block)
             return block
+        self._block_pool.free_block(block)
 
-        block = self.allocate_mutable(prev_block)
+        # No cached block => Allocate a new block
+        block = self.allocate_mutable_block(prev_block)
         block.append_token_ids(token_ids)
-        assert block.content_hash is not None
-
         return block
 
-    def allocate_mutable(self,
-                         prev_block: Optional[Block],
-                         device: Optional[Device] = None) -> Block:
+    def allocate_immutable_blocks(
+            self,
+            prev_block: Optional[Block],
+            block_token_ids: List[List[int]],
+            device: Optional[Device] = None) -> List[Block]:
+        blocks = []
+        for token_ids in block_token_ids:
+            prev_block = self.allocate_immutable_block(prev_block=prev_block,
+                                                       token_ids=token_ids,
+                                                       device=device)
+            blocks.append(prev_block)
+        return blocks
+
+    def allocate_mutable_block(self,
+                               prev_block: Optional[Block],
+                               device: Optional[Device] = None) -> Block:
         """Allocates a mutable block. If there are no free blocks, this will
         evict unused cached blocks.
 
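
Note: BlockTracker replaces the old Dict[BlockId, Block] side table with one small slotted status object per physical block id, created once in the constructor above instead of inserting and removing whole Block objects on every allocate/free. A quick standalone way to see the __slots__ footprint difference (illustrative sketch only):

import sys

class TrackerDict:
    def __init__(self):
        self.active, self.last_accessed, self.computed = False, -1.0, False

class TrackerSlots:
    __slots__ = ("active", "last_accessed", "computed")
    def __init__(self):
        self.active, self.last_accessed, self.computed = False, -1.0, False

# A slotted instance carries no per-instance __dict__, which matters when
# one tracker exists per physical KV-cache block:
d = TrackerDict()
s = TrackerSlots()
print(sys.getsizeof(s))                              # slotted instance
print(sys.getsizeof(d) + sys.getsizeof(d.__dict__))  # dict-backed instance
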
@ -147,116 +194,154 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         assert device is None
         assert_prefix_caching_block_or_none(prev_block)
 
-        try:
-            block = self._hashless_allocator.allocate_mutable(
-                prev_block=prev_block)
-
-            assert block.block_id not in self._blocks
-            assert block.block_id is not None
-            self._blocks[block.block_id] = block
-            return block
-        except BlockAllocator.NoFreeBlocksError:
-            # We must check the unused cached blocks before raising OOM.
-            pass
-
-        # If the evictor has blocks available for eviction, evict a block
-        # and return it.
-        if self.evictor.num_blocks > 0:
-            # here we get an evicted block, which is only added
-            # into evictor if its ref counter is 0
-            # and since its content would be changed, we need
-            # to remove it from _cached_blocks's tracking list
-            block_id, content_hash_to_evict = self.evictor.evict()
-
-            _block_id = self._cached_blocks[content_hash_to_evict]
-            assert self._refcounter.get(_block_id) == 0
-            assert _block_id == block_id
-
-            self._cached_blocks.pop(content_hash_to_evict)
-
-            self._refcounter.incr(block_id)
-
-            # Now this block is pop from evictor and ready to write
-            # with new content which most probably different with
-            # original content. So need to tell worker to recompute
-            # its kvcache
-            block = self._create_block(
-                prev_block=prev_block,
-                token_ids=[],
-                block_size=self._block_size,
-                allocator=self,
-                block_id=block_id,
-                computed=False,
-            )
-            assert block.content_hash is None
-
-            assert block.block_id not in self._blocks
-            assert block.block_id is not None
-            self._blocks[block.block_id] = block
-            return block
+        block_id = self._allocate_block_id()
+        block = self._block_pool.init_block(prev_block=prev_block,
+                                            token_ids=[],
+                                            block_size=self._block_size,
+                                            physical_block_id=block_id)
+        assert not block.computed
+        assert block.content_hash is None
+        return block
+
+    def _incr_refcount_cached_block(self, block: Block) -> None:
+        # Set this block to be "computed" since it is pointing to a
+        # cached block id (which was already computed)
+        block.computed = True
+
+        block_id = block.block_id
+        assert block_id is not None
+
+        refcount = self._refcounter.incr(block_id)
+        if refcount == 1:
+            # In case a cached block was evicted, restore its tracking
+            if block_id in self.evictor:
+                self.evictor.remove(block_id)
+
+            self._track_block_id(block_id, computed=True)
+
+    def _decr_refcount_cached_block(self, block: Block) -> None:
+        # Ensure this is immutable/cached block
+        assert block.content_hash is not None
+
+        block_id = block.block_id
+        assert block_id is not None
+
+        refcount = self._refcounter.decr(block_id)
+        if refcount > 0:
+            block.block_id = None
+            return
+        else:
+            assert refcount == 0
+
+        # No longer used
+        assert block.content_hash in self._cached_blocks
+
+        # Add the cached block to the evictor
+        # (This keeps the cached block around so it can be reused)
+        self.evictor.add(block_id, block.content_hash, block.num_tokens_total,
+                         self._block_tracker[block_id].last_accessed)
+
+        # Stop tracking the block
+        self._untrack_block_id(block_id)
+
+        block.block_id = None
+
+    def _decr_refcount_hashless_block(self, block: Block) -> None:
+        block_id = block.block_id
+        assert block_id is not None
+
+        # We may have a fork case where block is shared,
+        # in which case, we cannot remove it from tracking
+        refcount = self._refcounter.get(block_id)
+        if refcount == 1:
+            self._untrack_block_id(block_id)
+
+        # Decrement refcount of the block_id, but do not free the block object
+        # itself (will be handled by the caller)
+        self._hashless_allocator.free(block, keep_block_object=True)
+
+    def _allocate_block_id(self) -> BlockId:
+        """First tries to allocate a block id from the hashless allocator,
+        and if there are no blocks, then tries to evict an unused cached block.
+        """
+        hashless_block_id = self._maybe_allocate_hashless_block_id()
+        if hashless_block_id is not None:
+            return hashless_block_id
+
+        evicted_block_id = self._maybe_allocate_evicted_block_id()
+        if evicted_block_id is not None:
+            return evicted_block_id
 
         # No block available in hashless allocator, nor in unused cache blocks.
         raise BlockAllocator.NoFreeBlocksError()
 
-    def _incr_refcount_cached_block(self, block: Block,
-                                    block_id: BlockId) -> None:
-        # now _incr_refcount_cached_block comes from two place
-        # allocate_immutable/promote_to_immutable_block where hit
-        # _cached_blocks hash key.
-        # In both cases, it means that already exists a already
-        # computed block which shared with block now
-        block.computed = True
-
-        refcount = self._refcounter.incr(block_id)
-        if refcount == 1:
-            # if block get referred, then it shall not be in evictor
-            # and put it into _blocks for tracking
-            if block_id in self.evictor:
-                self.evictor.remove(block_id)
-            self._blocks[block_id] = block
+    def _maybe_allocate_hashless_block_id(self) -> Optional[BlockId]:
+        try:
+            # Allocate mutable block and extract its block_id
+            block = self._hashless_allocator.allocate_mutable_block(
+                prev_block=None)
+            block_id = block.block_id
+            self._block_pool.free_block(block)
+
+            self._track_block_id(block_id, computed=False)
+            return block_id
+        except BlockAllocator.NoFreeBlocksError:
+            return None
 
-    def free(self, block: Block) -> None:
-        """Decrement the refcount of the block. If the decremented refcount is
-        zero, store the block in the freelist.
-
-        If the block has a content hash (meaning it is immutable), then we will
-        keep the block around in case future allocations require it.
-        """
-        assert (block.block_id
-                is not None), "freeing unallocated block is undefined"
-
-        self._free_block_id_for_block(block.block_id, block)
-
-        block.block_id = None
-
-    def _free_block_id_for_block(self, block_id: BlockId,
-                                 block: Block) -> None:
-        assert isinstance(block, PrefixCachingBlock)
-
-        # if we comes from promote_to_immutable_block, it means that
-        # block.content_hash is never None.
-        # However we need to release the same content block, so that
-        # physical block could get reused.
-        if block.block_id != block_id or block.content_hash is None:
-            refcount = self._refcounter.get(block_id)
-            # We have fork case where block would get more than one ref,
-            # so we cannot free it from tracking if ref cnt large than 1
-            assert block.block_id is not None
-            refcount = self._refcounter.get(block.block_id)
-            if refcount == 1:
-                del self._blocks[block.block_id]
-
-            return self._hashless_allocator.free(block)
-
-        refcount = self._refcounter.decr(block_id)
-
-        # If no longer used, add the block to the evictor.
-        if refcount == 0:
-            assert block.content_hash in self._cached_blocks
-            assert block.block_id is not None
-            del self._blocks[block.block_id]
-            self.evictor.add(block.block_id, block.content_hash,
-                             block.num_tokens_total, block.last_accessed)
+    def _maybe_allocate_evicted_block_id(self) -> Optional[BlockId]:
+        if self.evictor.num_blocks == 0:
+            return None
+
+        # Here we get an evicted block, which is only added
+        # into evictor if its ref counter is 0
+        # and since its content would be changed, we need
+        # to remove it from _cached_blocks's tracking list
+        block_id, content_hash_to_evict = self.evictor.evict()
+
+        # Sanity checks
+        assert content_hash_to_evict in self._cached_blocks
+        _block_id = self._cached_blocks[content_hash_to_evict]
+        assert self._refcounter.get(_block_id) == 0
+        assert _block_id == block_id
+
+        self._cached_blocks.pop(content_hash_to_evict)
+
+        self._refcounter.incr(block_id)
+        self._track_block_id(block_id, computed=False)
+
+        return block_id
+
+    def _free_block_id(self, block: Block) -> None:
+        """Decrements the refcount of the block. The block may be in two
+        possible states: (1) immutable/cached or (2) mutable/hashless.
+        In the first case, the refcount is decremented directly and the block
+        may be possibly added to the evictor. In other case, hashless
+        allocator free(..) with keep_block_object=True is called to only free
+        the block id (since the block object may be reused by the caller)
+        """
+        block_id = block.block_id
+        assert block_id is not None, "Freeing unallocated block is undefined"
+
+        if block.content_hash is not None:
+            # Immutable: This type of block is always cached, and we want to
+            # keep it in the evictor for future reuse
+            self._decr_refcount_cached_block(block)
+        else:
+            # Mutable: This type of block is not cached, so we release it
+            # directly to the hashless allocator
+            self._decr_refcount_hashless_block(block)
+
+        assert block.block_id is None
+
+    def free(self, block: Block, keep_block_object: bool = False) -> None:
+        """Release the block (look at free_block_id(..) docs)
+        """
+        # Release the physical block index
+        self._free_block_id(block)
+
+        # Release the block object to the pool
+        if not keep_block_object:
+            self._block_pool.free_block(block)
 
     def fork(self, last_block: Block) -> List[Block]:
         """Creates a new sequence of blocks that shares the same underlying
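
Note: _free_block_id above branches on content_hash: a cached (immutable) block whose refcount hits 0 parks in the evictor so its contents stay reusable, while a hashless (mutable) block returns straight to the naive free list, and eviction is only a fallback for fresh allocations. A toy model of that ordering (not commit code):

from collections import deque

free_ids = deque()  # hashless free list
evictor = {}        # refcount-0 cached blocks: block_id -> content_hash

def free_block(block_id: int, content_hash) -> None:
    if content_hash is not None:
        evictor[block_id] = content_hash  # keep contents for prefix reuse
    else:
        free_ids.appendleft(block_id)     # immediately reusable

def allocate_id() -> int:
    if free_ids:
        return free_ids.popleft()         # prefer hashless ids
    if evictor:
        block_id, _ = evictor.popitem()   # sacrifice a cached block last
        return block_id
    raise RuntimeError("no free blocks")

free_block(0, content_hash=None)
free_block(1, content_hash=0xabc)
assert allocate_id() == 0  # hashless first
assert allocate_id() == 1  # cached block evicted only under pressure
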
@ -274,17 +359,20 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         forked_blocks: List[Block] = []
         prev_block = None
         for block in source_blocks:
-            refcount = self._refcounter.incr(block.block_id)
-            assert refcount != 1, "can't fork free'd block"
+            block_id = block.block_id
+            assert block_id is not None
+
+            refcount = self._refcounter.incr(block_id)
+            assert refcount != 1, "can't fork free'd block_id = {}".format(
+                block_id)
 
-            forked_blocks.append(
-                self._create_block(
-                    prev_block=prev_block,
-                    token_ids=block.token_ids,
-                    block_id=block.block_id,
-                    block_size=self._block_size,
-                    allocator=self,
-                ))
+            forked_block = self._block_pool.init_block(
+                prev_block=prev_block,
+                token_ids=block.token_ids,
+                block_size=self._block_size,
+                physical_block_id=block_id)
+
+            forked_blocks.append(forked_block)
             prev_block = forked_blocks[-1]
 
         return forked_blocks
@ -329,7 +417,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
 
         Note that if we already have a cached block with the same content, we
         will replace the newly-promoted block's mapping with the existing cached
-        block.
+        block id.
 
         Args:
             block: The mutable block to be promoted.
@ -338,23 +426,30 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             BlockId: Either the original block index, or the block index of
                 the previously cached block matching the same content.
         """
+        # Ensure block can be promoted
         assert block.content_hash is not None
         assert block.block_id is not None
         assert self._refcounter.get(block.block_id) > 0
 
-        # If the content hash does not have a corresponding cached block,
-        # set this block as the cached block.
         if block.content_hash not in self._cached_blocks:
+            # No cached content hash => Set this block as cached
+            # (Note that this block is not computed yet =>
+            #  Will be computed after free())
             self._cached_blocks[block.content_hash] = block.block_id
-        else:
-            self._free_block_id_for_block(
-                self._cached_blocks[block.content_hash], block)
-            self._incr_refcount_cached_block(
-                block, self._cached_blocks[block.content_hash])
+            return block.block_id
 
-        return self._cached_blocks[block.content_hash]
+        # Reuse the cached content hash
+        self._decr_refcount_hashless_block(block)
+        block.block_id = self._cached_blocks[block.content_hash]
 
-    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
+        # Increment refcount of the cached block and (possibly) restore
+        # it from the evictor.
+        # Note that in this case, the block is marked as computed
+        self._incr_refcount_cached_block(block)
+
+        return block.block_id
+
+    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
         """Performs a copy-on-write operation on the given block if it is not
         appendable.
 
@ -362,11 +457,22 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             block (Block): The block to check for copy-on-write.
 
         Returns:
-            Optional[BlockId]: The block index of the new block if a copy-on
-                -write operation was performed, or the original block index if
+            BlockId: The block index of the new block if a copy-on-write
+                operation was performed, or the original block index if
                 no copy-on-write was necessary.
         """
-        return self._cow_tracker.cow_block_if_not_appendable(block)
+        src_block_id = block.block_id
+        assert src_block_id is not None
+
+        if self._cow_tracker.is_appendable(block):
+            return src_block_id
+
+        self._free_block_id(block)
+        trg_block_id = self._allocate_block_id()
+
+        self._cow_tracker.record_cow(src_block_id, trg_block_id)
+
+        return trg_block_id
 
     def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
         """Returns the copy-on-write source->destination mapping and clears it.
@ -386,8 +492,8 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         """
 
         for block_id in block_ids:
-            if block_id in self._blocks:
-                self._blocks[block_id].last_accessed = now
+            if self._block_tracker[block_id].active:
+                self._block_tracker[block_id].last_accessed = now
             elif block_id in self.evictor:
                 self.evictor.update(block_id, now)
             else:
@ -395,25 +501,46 @@ class PrefixCachingBlockAllocator(BlockAllocator):
                     "Mark block as accessed which is not belonged to GPU")
 
     def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
-        """Mark blocks as computed, used in prefix caching."""
-
-        for block_id in block_ids:
-            if block_id in self._blocks:
-                # only those full block is valid for prefix caching
-                if self._blocks[block_id].is_full:
-                    self._blocks[block_id].computed = True
-            elif block_id not in self.evictor:
-                raise ValueError(f"Mark {block_id=} as computed which "
-                                 "is not belonged to GPU")
+        raise NotImplementedError("Marking as computed is incremental")
+
+    def _track_block_id(self, block_id: Optional[BlockId],
+                        computed: bool) -> None:
+        assert block_id is not None
+        self._block_tracker[block_id].enable()
+        self._block_tracker[block_id].computed = computed
+
+    def _untrack_block_id(self, block_id: Optional[BlockId]) -> None:
+        assert block_id is not None
+        self._block_tracker[block_id].disable()
 
     def block_is_computed(self, block_id: int) -> bool:
-        if block_id in self._blocks:
-            return self._blocks[block_id].computed
+        if self._block_tracker[block_id].active:
+            return self._block_tracker[block_id].computed
         else:
             return block_id in self.evictor
 
+    def get_computed_block_ids(self,
+                               prev_computed_block_ids: List[int],
+                               block_ids: List[int],
+                               skip_last_block_id: bool = True) -> List[int]:
+        prev_prefix_size = len(prev_computed_block_ids)
+        cur_size = len(block_ids)
+        if skip_last_block_id:
+            cur_size -= 1
+
+        # Sanity checks
+        assert cur_size >= 0
+        assert prev_prefix_size <= cur_size
+
+        ret = prev_computed_block_ids
+        for i in range(prev_prefix_size, cur_size):
+            block_id = block_ids[i]
+            if self.block_is_computed(block_id):
+                ret.append(block_id)
+        return ret
+
     def get_common_computed_block_ids(
-            self, seq_block_ids: List[List[int]]) -> List[int]:
+            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
         """Return the block ids that are common for a given sequence group.
 
         Only those blocks that are immutable and already be marked
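
Note: get_computed_block_ids above resumes scanning at len(prev_computed_block_ids), so a sequence that is rescheduled every step pays only for its newly appended blocks. A standalone sketch of the same idea (hypothetical names, not commit code):

from typing import List, Set

computed_ids: Set[int] = {10, 11, 12}  # block ids marked computed

def extend_computed_prefix(prev: List[int], block_ids: List[int],
                           skip_last: bool = True) -> List[int]:
    # Resume where the previous call stopped (len(prev)) instead of
    # re-checking every block id on each scheduling step.
    end = len(block_ids) - 1 if skip_last else len(block_ids)
    out = list(prev)
    for block_id in block_ids[len(prev):end]:
        if block_id in computed_ids:
            out.append(block_id)
    return out

first = extend_computed_prefix([], [10, 11, 20])          # scans 10, 11
assert first == [10, 11]
second = extend_computed_prefix(first, [10, 11, 20, 21])  # scans only 20
assert second == [10, 11]
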
@ -424,14 +551,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
         # prompt is cached. This would cause erroneous behavior in model
         # runner.
 
-        ids_list = [
-            list(
-                takewhile(lambda block_id: self.block_is_computed(block_id),
-                          seq[:-1])) for seq in seq_block_ids
-        ]
         # It returns a list of int although type annotation says list of string.
         return commonprefix([
-            ids for ids in ids_list  # type: ignore
+            ids for ids in computed_seq_block_ids  # type: ignore
             if ids != []
         ])
 
@ -473,10 +595,10 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             blocks: List of blocks to be swapped out.
         """
         for block in blocks:
-            self.free(block)
+            self._free_block_id(block)
 
     def swap_in(self, blocks: List[Block]) -> None:
-        """Execute the swap int actions. Change the block id from
+        """Execute the swap in actions. Change the block id from
         old allocator to current allocator for each block to finish
         the block table update.
 
@ -484,13 +606,22 @@ class PrefixCachingBlockAllocator(BlockAllocator):
             blocks: List of blocks to be swapped in.
         """
         for block in blocks:
+            # Here we allocate either immutable or mutable block and then
+            # extract its block_id. Note that the block object is released
+            # and the block_id is assigned to "block" to allow reusing the
+            # existing "block" object
             if block.is_full:
-                alloc = self.allocate_immutable(block.prev_block,
-                                                block.token_ids)
+                tmp_block = self.allocate_immutable_block(
+                    prev_block=block.prev_block, token_ids=block.token_ids)
             else:
-                alloc = self.allocate_mutable(block.prev_block)
-                alloc.append_token_ids(block.token_ids)
-            block.block_id = alloc.block_id
+                tmp_block = self.allocate_mutable_block(
+                    prev_block=block.prev_block)
+                tmp_block.append_token_ids(block.token_ids)
+
+            block_id = tmp_block.block_id
+            self._block_pool.free_block(tmp_block)
+
+            block.block_id = block_id  # Assign block_id
 
 
 class PrefixCachingBlock(Block):
@ -507,7 +638,7 @@ class PrefixCachingBlock(Block):
         token_ids (List[int]): The initial token IDs to be stored in the block.
         block_size (int): The maximum number of token IDs that can be stored in
             the block.
-        prefix_caching_allocator (BlockAllocator): The prefix
+        allocator (BlockAllocator): The prefix
             caching block allocator associated with this block.
         block_id (Optional[int], optional): The physical block index
             of this block. Defaults to None.
@ -518,31 +649,55 @@ class PrefixCachingBlock(Block):
         prev_block: Optional[Block],
         token_ids: List[int],
         block_size: int,
-        prefix_caching_allocator: BlockAllocator,
+        allocator: BlockAllocator,
         block_id: Optional[int] = None,
         computed: bool = False,
    ):
-        assert isinstance(prefix_caching_allocator,
-                          PrefixCachingBlockAllocator), (
-                              "Currently this class is only tested with "
-                              "PrefixCachingBlockAllocator.")
+        assert isinstance(allocator, PrefixCachingBlockAllocator), (
+            "Currently this class is only tested with "
+            "PrefixCachingBlockAllocator. Got instead allocator = {}".format(
+                allocator))
         assert_prefix_caching_block_or_none(prev_block)
 
         self._prev_block = prev_block
         self._cached_content_hash: Optional[int] = None
-        self._cached_num_tokens_total: Optional[int] = None
-        self._prefix_caching_allocator = prefix_caching_allocator
+        self._cached_num_tokens_total: int = 0
+        self._allocator = allocator
         self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
         self._computed = computed
 
-        self._block = NaiveBlock(
-            prev_block=prev_block,
-            token_ids=token_ids,
-            block_size=block_size,
-            block_id=block_id,
-            allocator=prefix_caching_allocator,
-            _cow_target=self,
-        )
+        # On the first time, we create the block object, and next we only
+        # reinitialize it
+        if hasattr(self, "_block"):
+            self._block.__init__(  # type: ignore[has-type]
+                prev_block=prev_block,
+                token_ids=token_ids,
+                block_size=block_size,
+                block_id=block_id,
+                allocator=self._allocator)
+        else:
+            self._block = NaiveBlock(prev_block=prev_block,
+                                     token_ids=token_ids,
+                                     block_size=block_size,
+                                     block_id=block_id,
+                                     allocator=self._allocator)
+
+        self._update_num_tokens_total()
+
+    def _update_num_tokens_total(self):
+        """Incrementally computes the number of tokens that there is
+        till the current block (included)
+        """
+        res = 0
+
+        # Add all previous blocks
+        if self._prev_block is not None:
+            res += self._prev_block.num_tokens_total
+
+        # Add current block
+        res += len(self.token_ids)
+
+        self._cached_num_tokens_total = res
 
     @property
     def computed(self) -> bool:
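
Note: _update_num_tokens_total above maintains tokens-so-far as a running prefix sum at init/append time; the num_tokens_total walk deleted further down was O(N) per call (O(N^2) over a sequence, as its own TODO admitted). Minimal sketch of the recurrence (standalone, not commit code):

from typing import Optional

class ChainBlock:
    """Caches tokens-so-far = parent's total + own length, making
    num_tokens_total an O(1) lookup instead of a chain walk."""

    def __init__(self, prev: Optional["ChainBlock"], num_tokens: int):
        self.prev = prev
        self.num_tokens_total = (
            (prev.num_tokens_total if prev is not None else 0) + num_tokens)

a = ChainBlock(None, 16)
b = ChainBlock(a, 16)
c = ChainBlock(b, 7)
assert c.num_tokens_total == 39  # no traversal at query time
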
@ -564,22 +719,28 @@ class PrefixCachingBlock(Block):
         """Appends the given token IDs to the block and registers the block as
         immutable if the block becomes full.
 
-        Internally, the naive block handles CoW.
-
         Args:
             token_ids (List[int]): The token IDs to be appended to the block.
         """
-        assert token_ids
+        # Ensure this is mutable block (not promoted)
+        assert self.content_hash is None
+        assert not self.computed
+
+        if len(token_ids) == 0:
+            return
 
-        # naive block handles CoW.
+        # Ensure there are input tokens
+        assert token_ids, "Got token_ids = {}".format(token_ids)
+
+        # Naive block handles CoW.
         self._block.append_token_ids(token_ids)
+        self._update_num_tokens_total()
 
         # If the content hash is present, then the block can be made immutable.
         # Register ourselves with the allocator, potentially replacing the
         # physical block index.
         if self.content_hash is not None:
-            self.block_id = (self._prefix_caching_allocator.
-                             promote_to_immutable_block(self))
+            self.block_id = self._allocator.promote_to_immutable_block(self)
 
     @property
     def block_id(self) -> Optional[int]:
@ -599,23 +760,6 @@ class PrefixCachingBlock(Block):
 
     @property
     def num_tokens_total(self) -> int:
-        """return the total tokens so far.
-
-        Here we iterate the block chain till to the first block, while
-        cache the result in local to prevent repeated computations.
-        """
-        if self._cached_num_tokens_total is not None:
-            return self._cached_num_tokens_total
-
-        _block: Optional[Block] = self
-        self._cached_num_tokens_total = 0
-
-        # TODO: current implement here take O(N^2), we expect future
-        # we have O(1) here
-        while _block is not None:
-            self._cached_num_tokens_total += len(_block.token_ids)
-            _block = _block.prev_block
-
         return self._cached_num_tokens_total
 
     @property
@ -638,7 +782,6 @@ class PrefixCachingBlock(Block):
         For the content-based hash to be defined, the current block must be
         full.
         """
-
         # If the hash is already computed, return it.
         if self._cached_content_hash is not None:
             return self._cached_content_hash
@ -688,7 +831,129 @@ class PrefixCachingBlock(Block):
         return hash((is_first_block, prev_block_hash, *cur_block_token_ids))
+
+
+class ComputedBlocksTracker:
+    """Handles caching of per-sequence computed block ids.
+    When a sequence appears for the first time, it traverses all of the
+    blocks and detects the prefix of blocks that is computed. On the
+    subsequent times, it only traverses the new blocks that were added
+    and updates the already recorded prefix of blocks with the newly
+    computed blocks.
+
+    To avoid redundant traversals, the algorithm also detects when there
+    is a "gap" in the computed prefix. For example, if we have blocks =
+    [1,2,3,4,5], and we have detected [1,2,3] as the computed prefix, then
+    we won't try to add more computed blocks to [1,2,3] in this sequence
+    iteration, and will add more computed blocks only after the sequence is
+    freed and reused again.
+
+    Note that currently, for a given sequence, we also skip the last
+    block id for caching purposes, to avoid caching of a full sequence
+    """
+
+    def __init__(self, allocator):
+        self._allocator = allocator
+        self._cached_computed_seq_blocks: Dict[int, Tuple[List[int],
+                                                          bool]] = {}
+
+    def add_seq(self, seq_id: int) -> None:
+        """Start tracking seq_id
+        """
+        assert seq_id not in self._cached_computed_seq_blocks
+        self._cached_computed_seq_blocks[seq_id] = ([], False)
+
+    def remove_seq(self, seq_id: int) -> None:
+        """Stop tracking seq_id
+        """
+        assert seq_id in self._cached_computed_seq_blocks
+        del self._cached_computed_seq_blocks[seq_id]
+
+    def get_cached_computed_blocks_and_update(
+            self, seq_id: int, block_ids: List[int]) -> List[int]:
+        """ Look at the class documentation for details
+        """
+        # Ensure seq_id is already tracked
+        assert seq_id in self._cached_computed_seq_blocks
+
+        # Get cached data (may be empty on the first time)
+        prev_computed_block_ids, has_gap = self._cached_computed_seq_blocks[
+            seq_id]
+
+        if has_gap:
+            # When gap is detected, we do not add more computed blocks at this
+            # sequence iteration
+            return prev_computed_block_ids
+
+        # We do not consider the last block id for caching purposes.
+        num_cur_blocks = len(block_ids) - 1
+        assert num_cur_blocks >= 0
+
+        if len(prev_computed_block_ids) >= num_cur_blocks:
+            # Cache HIT
+            assert len(prev_computed_block_ids) == num_cur_blocks
+            return prev_computed_block_ids
+
+        # If here, then we may possibly add more computed blocks. As a result,
+        # traverse the additional blocks after prev_computed_block_ids to
+        # detect more computed blocks and add them.
+
+        # Incremental init for seq_id => Look only at the new blocks
+        computed_block_ids = self._allocator.get_computed_block_ids(  # noqa: E501
+            prev_computed_block_ids,
+            block_ids,
+            skip_last_block_id=
+            True,  # We skip last block id to avoid caching of full seq
+        )
+
+        # Detect if there is a "gap"
+        has_gap = len(computed_block_ids) < num_cur_blocks
+
+        # Record
+        self._cached_computed_seq_blocks[seq_id] = (computed_block_ids,
+                                                    has_gap)
+
+        return computed_block_ids
+
|
|
||||||
|
class LastAccessBlocksTracker:
|
||||||
|
"""Manages the last access time of the tracked sequences, in order to allow
|
||||||
|
an efficient update of allocator's block last access times
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, allocator):
|
||||||
|
self._allocator = allocator
|
||||||
|
self._seq_last_access: Dict[int, Optional[float]] = {}
|
||||||
|
|
||||||
|
def add_seq(self, seq_id: int) -> None:
|
||||||
|
"""Start tracking seq_id
|
||||||
|
"""
|
||||||
|
assert seq_id not in self._seq_last_access
|
||||||
|
self._seq_last_access[seq_id] = None
|
||||||
|
|
||||||
|
def remove_seq(self, seq_id: int) -> None:
|
||||||
|
"""Stop tracking seq_id
|
||||||
|
"""
|
||||||
|
assert seq_id in self._seq_last_access
|
||||||
|
del self._seq_last_access[seq_id]
|
||||||
|
|
||||||
|
def update_last_access(self, seq_id: int, time: float) -> None:
|
||||||
|
assert seq_id in self._seq_last_access
|
||||||
|
self._seq_last_access[seq_id] = time
|
||||||
|
|
||||||
|
def update_seq_blocks_last_access(self, seq_id: int,
|
||||||
|
block_ids: List[int]) -> None:
|
||||||
|
assert seq_id in self._seq_last_access
|
||||||
|
|
||||||
|
ts = self._seq_last_access[seq_id]
|
||||||
|
|
||||||
|
if ts is None:
|
||||||
|
# No last access was recorded, no need to update.
|
||||||
|
return
|
||||||
|
|
||||||
|
self._allocator.mark_blocks_as_accessed(block_ids, ts)
|
||||||
|
|
||||||
|
|
||||||
def assert_prefix_caching_block_or_none(block: Optional[Block]):
|
def assert_prefix_caching_block_or_none(block: Optional[Block]):
|
||||||
if block is None:
|
if block is None:
|
||||||
return
|
return
|
||||||
assert isinstance(block, PrefixCachingBlock)
|
assert isinstance(block,
|
||||||
|
PrefixCachingBlock), "Got block = {}".format(block)
|
||||||
|
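How the new ComputedBlocksTracker behaves, exercised against a stub allocator (StubAllocator is hypothetical; only ComputedBlocksTracker comes from the code added above, and the sketch assumes this commit is applied):

from typing import List

from vllm.core.block.prefix_caching_block import ComputedBlocksTracker


class StubAllocator:
    """Hypothetical allocator: block ids below 100 count as computed."""

    def get_computed_block_ids(self, prev_computed_block_ids: List[int],
                               block_ids: List[int],
                               skip_last_block_id: bool) -> List[int]:
        candidates = block_ids[:-1] if skip_last_block_id else block_ids
        computed = list(prev_computed_block_ids)
        # Only look at blocks after the already-known computed prefix.
        for block_id in candidates[len(prev_computed_block_ids):]:
            if block_id >= 100:
                break  # first non-computed block ends the prefix
            computed.append(block_id)
        return computed


tracker = ComputedBlocksTracker(StubAllocator())
tracker.add_seq(0)
# First call traverses and finds [1, 2]; block 103 creates a "gap".
assert tracker.get_cached_computed_blocks_and_update(
    0, [1, 2, 103, 7]) == [1, 2]
# With the gap recorded, later calls return the cache without traversal.
assert tracker.get_cached_computed_blocks_and_update(
    0, [1, 2, 103, 7, 9]) == [1, 2]
tracker.remove_seq(0)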
@ -7,6 +7,8 @@ from typing import Tuple
 from vllm.core.block.block_table import BlockTable
 from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
 from vllm.core.block.interfaces import Block
+from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker,
+                                                  LastAccessBlocksTracker)
 from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec
 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
@ -100,6 +102,11 @@ class BlockSpaceManagerV2(BlockSpaceManager):
         self.block_tables: Dict[SeqId, BlockTable] = {}
         self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {}
 
+        self._computed_blocks_tracker = ComputedBlocksTracker(
+            self.block_allocator)
+        self._last_access_blocks_tracker = LastAccessBlocksTracker(
+            self.block_allocator)
+
     def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
         # FIXME(woosuk): Here we assume that all sequences in the group share
         # the same prompt. This may not be true for preempted sequences.
@ -157,10 +164,18 @@ class BlockSpaceManagerV2(BlockSpaceManager):
         block_table: BlockTable = self._allocate_sequence(seq)
         self.block_tables[seq.seq_id] = block_table
 
+        # Track seq
+        self._computed_blocks_tracker.add_seq(seq.seq_id)
+        self._last_access_blocks_tracker.add_seq(seq.seq_id)
+
         # Assign the block table for each sequence.
         for seq in waiting_seqs[1:]:
             self.block_tables[seq.seq_id] = block_table.fork()
 
+            # Track seq
+            self._computed_blocks_tracker.add_seq(seq.seq_id)
+            self._last_access_blocks_tracker.add_seq(seq.seq_id)
+
         # Allocate cross-attention block table for encoder sequence
         #
         # NOTE: Here we assume that all sequences in the group have the same
@ -224,11 +239,23 @@ class BlockSpaceManagerV2(BlockSpaceManager):
         return new_cows
 
     def free(self, seq: Sequence) -> None:
-        if seq.seq_id not in self.block_tables:
+        seq_id = seq.seq_id
+
+        if seq_id not in self.block_tables:
             # Already freed or haven't been scheduled yet.
             return
-        self.block_tables[seq.seq_id].free()
-        del self.block_tables[seq.seq_id]
+
+        # Update seq block ids with the latest access time
+        self._last_access_blocks_tracker.update_seq_blocks_last_access(
+            seq_id, self.block_tables[seq.seq_id].physical_block_ids)
+
+        # Untrack seq
+        self._last_access_blocks_tracker.remove_seq(seq_id)
+        self._computed_blocks_tracker.remove_seq(seq_id)
+
+        # Free table/blocks
+        self.block_tables[seq_id].free()
+        del self.block_tables[seq_id]
 
     def free_cross(self, seq_group: SequenceGroup) -> None:
         request_id = seq_group.request_id
@ -239,9 +266,7 @@ class BlockSpaceManagerV2(BlockSpaceManager):
         del self.cross_block_tables[request_id]
 
     def get_block_table(self, seq: Sequence) -> List[int]:
-        assert seq.seq_id in self.block_tables
         block_ids = self.block_tables[seq.seq_id].physical_block_ids
-        assert all(b is not None for b in block_ids)
         return block_ids  # type: ignore
 
     def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]:
@ -252,20 +277,14 @@ class BlockSpaceManagerV2(BlockSpaceManager):
         return block_ids  # type: ignore
 
     def access_all_blocks_in_seq(self, seq: Sequence, now: float):
-        # Update the last accessed time of all the blocks accessed
-        # in this step.
-        # And the accessed time is only useful for prefix caching now,
-        # as it support internal evictor policy for which cached
-        # block could be refilled, to keep cached content could be reused
-        # at max extend.
         if self.enable_caching:
-            block_table = self.block_tables[seq.seq_id]
-            block_ids: List[Optional[int]] = []
-            for block_id in block_table.physical_block_ids:
-                block_ids.append(block_id)
-            self.block_allocator.mark_blocks_as_accessed(
-                block_ids,  # type: ignore
-                now)
+            # Record the latest access time for the sequence. The actual update
+            # of the block ids is deferred to the sequence free(..) call, since
+            # only during freeing of block ids, the blocks are actually added to
+            # the evictor (which is when the most updated time is required)
+            # (This avoids expensive calls to mark_blocks_as_accessed(..))
+            self._last_access_blocks_tracker.update_last_access(
+                seq.seq_id, now)
 
     def mark_blocks_as_computed(self, seq_group: SequenceGroup):
         # The only need for mark block as computed is for prefix caching,
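The free() and access_all_blocks_in_seq() changes combine into one pattern: an O(1) timestamp write per scheduler step, and a single batched mark_blocks_as_accessed(..) call at free time carrying the latest timestamp. A sketch against a stub allocator (RecordingAllocator is hypothetical; LastAccessBlocksTracker is the class added earlier in this commit):

from typing import List

from vllm.core.block.prefix_caching_block import LastAccessBlocksTracker


class RecordingAllocator:
    """Stub allocator that just records mark_blocks_as_accessed calls."""

    def __init__(self) -> None:
        self.calls: List[tuple] = []

    def mark_blocks_as_accessed(self, block_ids: List[int],
                                ts: float) -> None:
        self.calls.append((list(block_ids), ts))


allocator = RecordingAllocator()
tracker = LastAccessBlocksTracker(allocator)

tracker.add_seq(7)                  # allocate(): start tracking
tracker.update_last_access(7, 1.0)  # each scheduler step: O(1) write
tracker.update_last_access(7, 2.0)
# free(): one batched update with the *latest* timestamp, then untrack.
tracker.update_seq_blocks_last_access(7, [10, 11, 12])
tracker.remove_seq(7)

assert allocator.calls == [([10, 11, 12], 2.0)]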
@ -285,17 +304,26 @@ class BlockSpaceManagerV2(BlockSpaceManager):
         This method determines which blocks can be safely skipped for all
         sequences in the sequence group.
         """
-        seq_block_ids = [
-            self.block_tables[seq.seq_id].physical_block_ids for seq in seqs
-        ]
+        computed_seq_block_ids = []
+        for seq in seqs:
+            computed_seq_block_ids.append(
+                self._computed_blocks_tracker.
+                get_cached_computed_blocks_and_update(
+                    seq.seq_id,
+                    self.block_tables[seq.seq_id].physical_block_ids))
 
         # NOTE(sang): This assumes seq_block_ids doesn't contain any None.
         return self.block_allocator.get_common_computed_block_ids(
-            seq_block_ids)  # type: ignore
+            computed_seq_block_ids)  # type: ignore
 
     def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
         src_block_table = self.block_tables[parent_seq.seq_id]
         self.block_tables[child_seq.seq_id] = src_block_table.fork()
 
+        # Track child seq
+        self._computed_blocks_tracker.add_seq(child_seq.seq_id)
+        self._last_access_blocks_tracker.add_seq(child_seq.seq_id)
+
     def can_swap_in(self, seq_group: SequenceGroup,
                     num_lookahead_slots: int) -> AllocStatus:
         """Returns the AllocStatus for the given sequence_group
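get_common_computed_block_ids(..) still intersects the per-sequence computed prefixes; the change is only where each prefix comes from (the tracker cache rather than a fresh traversal). A toy stand-in for the intersection step, assuming prefix semantics (common_computed_prefix is illustrative, not the allocator's actual implementation):

from typing import List


def common_computed_prefix(computed_seq_block_ids: List[List[int]]
                           ) -> List[int]:
    """Blocks can be skipped only if *every* sequence has them computed."""
    if not computed_seq_block_ids:
        return []
    first, *rest = computed_seq_block_ids
    common: List[int] = []
    for i, block_id in enumerate(first):
        if all(len(ids) > i and ids[i] == block_id for ids in rest):
            common.append(block_id)
        else:
            break
    return common


# Two forked sequences share their first two physical blocks.
assert common_computed_prefix([[0, 1, 5], [0, 1, 9]]) == [0, 1]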
@ -323,19 +351,31 @@ class BlockSpaceManagerV2(BlockSpaceManager):
             List[Tuple[int, int]]: The mapping of swapping block from CPU
             to GPU.
         """
-        blocks = self._get_blocks_for_swap(seq_group, SequenceStatus.SWAPPED)
-        current_swap_mapping = self.block_allocator.swap(
-            blocks=blocks, source_device=Device.CPU, dest_device=Device.GPU)
+        physical_block_id_mapping = []
+        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
+            blocks = self.block_tables[seq.seq_id].blocks
+            if len(blocks) == 0:
+                continue
 
-        block_number_mapping = {
-            self.block_allocator.get_physical_block_id(Device.CPU,
-                                                       cpu_block_id):
-            self.block_allocator.get_physical_block_id(Device.GPU,
-                                                       gpu_block_id)
-            for cpu_block_id, gpu_block_id in current_swap_mapping.items()
-        }
-        # convert to list of tuples once here
-        return list(block_number_mapping.items())
+            seq_swap_mapping = self.block_allocator.swap(blocks=blocks,
+                                                         src_device=Device.CPU,
+                                                         dst_device=Device.GPU)
+
+            # Refresh the block ids of the table (post-swap)
+            self.block_tables[seq.seq_id].update(blocks)
+
+            seq_physical_block_id_mapping = {
+                self.block_allocator.get_physical_block_id(
+                    Device.CPU, cpu_block_id):
+                self.block_allocator.get_physical_block_id(
+                    Device.GPU, gpu_block_id)
+                for cpu_block_id, gpu_block_id in seq_swap_mapping.items()
+            }
+
+            physical_block_id_mapping.extend(
+                list(seq_physical_block_id_mapping.items()))
+
+        return physical_block_id_mapping
 
     def can_swap_out(self, seq_group: SequenceGroup) -> bool:
         """Returns whether we can swap out the given sequence_group
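swap_in(..) now swaps sequence by sequence, refreshes each table's block ids post-swap, and folds every per-sequence id mapping into one flat list of (src, dst) physical block ids. The folding step in isolation (fold_swap_mappings is a hypothetical helper, not code from this commit):

from typing import Dict, List, Tuple


def fold_swap_mappings(
        per_seq_mappings: List[Dict[int, int]]) -> List[Tuple[int, int]]:
    """Flatten per-sequence {src: dst} block id maps into one tuple list."""
    physical_block_id_mapping: List[Tuple[int, int]] = []
    for seq_swap_mapping in per_seq_mappings:
        physical_block_id_mapping.extend(seq_swap_mapping.items())
    return physical_block_id_mapping


assert fold_swap_mappings([{0: 4}, {1: 5, 2: 6}]) == [(0, 4), (1, 5), (2, 6)]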
@ -355,7 +395,7 @@ class BlockSpaceManagerV2(BlockSpaceManager):
             return True
         return False
 
-    def swap_out(self, sequence_group: SequenceGroup) -> List[Tuple[int, int]]:
+    def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
         """Returns the block id mapping (from GPU to CPU) generated by
         swapping out the given sequence_group with num_lookahead_slots.
 
@ -366,19 +406,31 @@ class BlockSpaceManagerV2(BlockSpaceManager):
             List[Tuple[int, int]]: The mapping of swapping block from
             GPU to CPU.
         """
-        blocks = self._get_blocks_for_swap(sequence_group,
-                                           SequenceStatus.RUNNING)
-        current_swap_mapping = self.block_allocator.swap(
-            blocks=blocks, source_device=Device.GPU, dest_device=Device.CPU)
-        block_number_mapping = {
-            self.block_allocator.get_physical_block_id(Device.GPU,
-                                                       gpu_block_id):
-            self.block_allocator.get_physical_block_id(Device.CPU,
-                                                       cpu_block_id)
-            for gpu_block_id, cpu_block_id in current_swap_mapping.items()
-        }
-        # convert to list of tuples once here
-        return list(block_number_mapping.items())
+        physical_block_id_mapping = []
+        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+            blocks = self.block_tables[seq.seq_id].blocks
+            if len(blocks) == 0:
+                continue
+
+            seq_swap_mapping = self.block_allocator.swap(blocks=blocks,
+                                                         src_device=Device.GPU,
+                                                         dst_device=Device.CPU)
+
+            # Refresh the block ids of the table (post-swap)
+            self.block_tables[seq.seq_id].update(blocks)
+
+            seq_physical_block_id_mapping = {
+                self.block_allocator.get_physical_block_id(
+                    Device.GPU, gpu_block_id):
+                self.block_allocator.get_physical_block_id(
+                    Device.CPU, cpu_block_id)
+                for gpu_block_id, cpu_block_id in seq_swap_mapping.items()
+            }
+
+            physical_block_id_mapping.extend(
+                list(seq_physical_block_id_mapping.items()))
+
+        return physical_block_id_mapping
 
     def get_num_free_gpu_blocks(self) -> int:
         return self.block_allocator.get_num_free_blocks(Device.GPU)
@ -177,7 +177,8 @@ class LLMEngine:
             "enforce_eager=%s, kv_cache_dtype=%s, "
             "quantization_param_path=%s, device_config=%s, "
             "decoding_config=%r, observability_config=%r, "
-            "seed=%d, served_model_name=%s)",
+            "seed=%d, served_model_name=%s, use_v2_block_manager=%s, "
+            "enable_prefix_caching=%s)",
             VLLM_VERSION,
             model_config.model,
             speculative_config,
@ -204,6 +205,8 @@ class LLMEngine:
             observability_config,
             model_config.seed,
             model_config.served_model_name,
+            scheduler_config.use_v2_block_manager,
+            cache_config.enable_prefix_caching,
         )
         # TODO(woosuk): Print more configs in debug mode.
 
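The engine banner keeps lazy %-style formatting, so the two new fields are only interpolated when the INFO level is enabled; roughly the pattern is:

import logging

logger = logging.getLogger("vllm")
# Arguments are evaluated into the string only if INFO is enabled.
logger.info(
    "Initializing an LLM engine (use_v2_block_manager=%s, "
    "enable_prefix_caching=%s)", True, False)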
@ -345,7 +345,7 @@ class OpenAIServingCompletion(OpenAIServing):
                 out_logprobs = prompt_logprobs
                 output_text = prompt_text
             elif request.echo and request.max_tokens > 0:
-                token_ids = prompt_token_ids + output.token_ids
+                token_ids = prompt_token_ids + list(output.token_ids)
                 out_logprobs = (prompt_logprobs + output.logprobs
                                 if request.logprobs is not None else None)
                 output_text = prompt_text + output.text
@ -427,8 +427,8 @@ class SamplingTensors:
             if seq_group.do_sample:
                 for seq_id in seq_ids:
                     seq_data = seq_group.seq_data[seq_id]
-                    prompt_tokens.append(seq_data.prompt_token_ids)
-                    output_tokens.append(seq_data.output_token_ids)
+                    prompt_tokens.append(list(seq_data.prompt_token_ids))
+                    output_tokens.append(list(seq_data.output_token_ids))
 
         sampling_tensors = SamplingTensors.from_lists(
             temperatures, top_ps, top_ks, min_ps, presence_penalties,
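Since SequenceData now hands out tuples (see the sequence.py hunks below), call sites that build or mutate token lists copy first. A minimal illustration of why the copy is needed (the padding step is an assumption about downstream use, not a quote of from_lists):

prompt_token_ids = (1, 2, 3)  # tuple view, as SequenceData now returns
prompt_tokens = list(prompt_token_ids)  # copy before any mutation
prompt_tokens += [0, 0]  # e.g. padding; += [0, 0] on a tuple raises TypeError
assert prompt_tokens == [1, 2, 3, 0, 0]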
@ -1,6 +1,6 @@
 import time
 from dataclasses import dataclass
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union
 
 from vllm.lora.request import LoRARequest
 from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs,
@ -28,7 +28,7 @@ class CompletionOutput:
 
     index: int
     text: str
-    token_ids: List[int]
+    token_ids: Tuple[int, ...]
     cumulative_logprob: float
     logprobs: Optional[SampleLogprobs]
     finish_reason: Optional[str] = None
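This field type change is what forces the list(..) copies seen earlier in this commit (VllmRunner's list(sample.token_ids) and OpenAIServingCompletion's list(output.token_ids)). The consumer-side pattern, in isolation:

prompt_token_ids = [101, 7592]
output_token_ids = (2088, 999)  # CompletionOutput.token_ids is now a tuple
# Concatenating list + tuple raises TypeError, hence the explicit copy:
token_ids = prompt_token_ids + list(output_token_ids)
assert token_ids == [101, 7592, 2088, 999]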
@ -116,41 +116,66 @@ class SequenceData:
         prompt_token_ids: List[int],
         output_token_ids: Optional[List[int]] = None,
     ) -> None:
-        if output_token_ids is None:
-            output_token_ids = []
+        self._prompt_token_ids: List[int] = list(prompt_token_ids)
+        self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids)
+        self._output_token_ids: List[int] = (
+            list(output_token_ids) if output_token_ids is not None else [])
 
-        self.prompt_token_ids = prompt_token_ids
-        self._prompt_token_ids_tuple = tuple(prompt_token_ids)
-        self.output_token_ids = output_token_ids
         self.cumulative_logprob = 0.0
         # The number of tokens that are computed (that run against the model).
         self._num_computed_tokens = 0
         self._stage: SequenceStage = SequenceStage.PREFILL
 
+        self._update_cached_all_tokens()
+
+    def _update_cached_all_tokens(self):
+        self._cached_all_token_ids: List[int] = (self._prompt_token_ids +
+                                                 self._output_token_ids)
+
+    @property
+    def prompt_token_ids(self) -> Tuple[int, ...]:
+        return self._prompt_token_ids_tuple
+
+    @prompt_token_ids.setter
+    def prompt_token_ids(self, new_prompt_token_ids) -> None:
+        self._prompt_token_ids = list(new_prompt_token_ids)
+        self._prompt_token_ids_tuple = tuple(new_prompt_token_ids)
+        self._update_cached_all_tokens()
+
+    @property
+    def output_token_ids(self) -> Tuple[int, ...]:
+        return tuple(self._output_token_ids)
+
+    @output_token_ids.setter
+    def output_token_ids(self, new_output_token_ids) -> None:
+        self._output_token_ids = list(new_output_token_ids)
+        self._update_cached_all_tokens()
+
     def append_token_id(self, token_id: int, logprob: float) -> None:
-        self.output_token_ids.append(token_id)
+        self._output_token_ids.append(token_id)
+        self._cached_all_token_ids.append(token_id)
         self.cumulative_logprob += logprob
 
     def get_len(self) -> int:
-        return len(self.output_token_ids) + len(self.prompt_token_ids)
+        return len(self._output_token_ids) + len(self._prompt_token_ids)
 
     def get_prompt_len(self) -> int:
-        return len(self.prompt_token_ids)
+        return len(self._prompt_token_ids)
 
     def get_output_len(self) -> int:
-        return len(self.output_token_ids)
+        return len(self._output_token_ids)
 
     def get_token_ids(self) -> List[int]:
-        return self.prompt_token_ids + self.output_token_ids
+        return self._cached_all_token_ids
 
     def get_prefix_token_ids(
             self, num_tokens: int
     ) -> Tuple[Tuple[int, ...], Optional[Tuple[int, ...]]]:
         """Get prefix tokens, and make the return value hashable"""
-        prompt_length = len(self.prompt_token_ids)
+        prompt_length = self.get_prompt_len()
         if num_tokens > prompt_length:
             return (self._prompt_token_ids_tuple,
-                    tuple(self.output_token_ids[:num_tokens - prompt_length]))
+                    tuple(self._output_token_ids[:num_tokens - prompt_length]))
         else:
             return (self._prompt_token_ids_tuple[:num_tokens], None)
 
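Taken together, the SequenceData changes give tuple-typed public views, an O(1) append path, and a cached full-token list that avoids re-concatenating prompt and output on every get_token_ids() call. A usage sketch, assuming this commit is applied:

from vllm.sequence import SequenceData

data = SequenceData([1, 2], output_token_ids=[3])
assert data.prompt_token_ids == (1, 2)    # immutable tuple view
assert data.output_token_ids == (3,)
assert data.get_token_ids() == [1, 2, 3]  # cached list, no re-concat

data.append_token_id(4, logprob=-0.5)     # O(1): appends to both lists
assert data.get_token_ids() == [1, 2, 3, 4]

data.output_token_ids = [9]               # setter rebuilds the cache
assert data.get_token_ids() == [1, 2, 9]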
@ -183,14 +208,14 @@ class SequenceData:
         return self.get_len() - self.get_num_computed_tokens()
 
     def get_last_token_id(self) -> int:
-        if not self.output_token_ids:
-            return self.prompt_token_ids[-1]
-        return self.output_token_ids[-1]
+        if not self._output_token_ids:
+            return self._prompt_token_ids[-1]
+        return self._output_token_ids[-1]
 
-    def get_prompt_token_ids(self) -> List[int]:
+    def get_prompt_token_ids(self) -> Tuple[int, ...]:
         return self.prompt_token_ids
 
-    def get_output_token_ids(self) -> List[int]:
+    def get_output_token_ids(self) -> Tuple[int, ...]:
         return self.output_token_ids
 
     @property
@ -199,8 +224,8 @@ class SequenceData:
 
     def __repr__(self) -> str:
         return (f"SequenceData("
-                f"prompt_token_ids={self.prompt_token_ids}, "
-                f"output_token_ids={self.output_token_ids}, "
+                f"prompt_token_ids={self._prompt_token_ids}, "
+                f"output_token_ids={self._output_token_ids}, "
                 f"cumulative_logprob={self.cumulative_logprob})")
 
 
@ -306,14 +331,14 @@ class Sequence:
     def get_token_ids(self) -> List[int]:
         return self.data.get_token_ids()
 
-    def get_prompt_token_ids(self) -> List[int]:
+    def get_prompt_token_ids(self) -> Tuple[int, ...]:
         return self.data.get_prompt_token_ids()
 
     def get_last_token_id(self) -> int:
         return self.data.get_last_token_id()
 
-    def get_output_token_ids(self) -> List[int]:
-        return self.data.output_token_ids
+    def get_output_token_ids(self) -> Tuple[int, ...]:
+        return self.data.get_output_token_ids()
 
     def get_cumulative_logprob(self) -> float:
         return self.data.cumulative_logprob