Fix tests in test_scheduler.py that fail with BlockManager V2 (#8728)

sroy745 2024-09-23 21:43:13 -07:00 committed by GitHub
parent 530821d00c
commit 88577ac928
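This change parametrizes each scheduler test over use_v2_block_manager with
@pytest.mark.parametrize, passes an explicit block_size through to
create_dummy_prompt, and extends the initialize_scheduler helper with
use_v2_block_manager, block_size, num_cpu_blocks, and num_gpu_blocks
arguments; several tests also raise their block counts, presumably so
BlockManager V2 has enough capacity for the same sequences. A minimal,
self-contained sketch of the pattern, where make_scheduler is a hypothetical
stand-in for the test file's initialize_scheduler helper (not vLLM's API):

    import pytest

    def make_scheduler(*, use_v2_block_manager: bool, num_gpu_blocks: int = 8):
        # Stand-in for initialize_scheduler: the real helper builds a
        # SchedulerConfig and CacheConfig and returns a vllm Scheduler.
        return {"v2": use_v2_block_manager, "gpu_blocks": num_gpu_blocks}

    @pytest.mark.parametrize('use_v2_block_manager', [True, False])
    def test_runs_under_both_block_managers(use_v2_block_manager: bool):
        # The test body executes twice, once per block manager version.
        scheduler = make_scheduler(use_v2_block_manager=use_v2_block_manager,
                                   num_gpu_blocks=64)
        assert scheduler["gpu_blocks"] == 64

With this parametrization, pytest runs every test body once per block manager
version, so a V2-only failure surfaces under a distinct test id such as
test_runs_under_both_block_managers[True].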

tests/core/test_scheduler.py

@@ -3,7 +3,8 @@ from collections import deque
 from typing import List, Set, Tuple
 from unittest.mock import MagicMock
 
-import pytest  # noqa
+import pytest
+from torch import Use  # noqa
 
 from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
 from vllm.core.interfaces import AllocStatus
@@ -16,9 +17,11 @@ from .utils import (append_new_token, append_new_token_seq_group,
                     schedule_and_update_computed_tokens)
 
 
-def test_scheduler_add_seq_group():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_add_seq_group(use_v2_block_manager: bool):
     block_size = 4
-    scheduler_config = SchedulerConfig(100, 64, 1)
+    scheduler_config = SchedulerConfig(
+        100, 64, 1, use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
     cache_config.num_cpu_blocks = 4
     cache_config.num_gpu_blocks = 4
@@ -27,14 +30,18 @@ def test_scheduler_add_seq_group():
     # Add seq group to scheduler.
     num_seq_group = 4
     for i in range(num_seq_group):
-        _, seq_group = create_dummy_prompt(str(i), block_size)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           block_size,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
         assert scheduler.get_num_unfinished_seq_groups() == i + 1
 
 
-def test_scheduler_abort_seq_group():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
     block_size = 4
-    scheduler_config = SchedulerConfig(100, 64, 1)
+    scheduler_config = SchedulerConfig(
+        100, 64, 1, use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 4
     cache_config.num_gpu_blocks = 4
@@ -54,11 +61,16 @@ def test_scheduler_abort_seq_group():
     assert scheduler.get_num_unfinished_seq_groups() == 0
 
 
-def test_scheduler_schedule_simple():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_schedule_simple(use_v2_block_manager: bool):
     block_size = 4
     num_seq_group = 4
     max_model_len = 16
-    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
+    scheduler_config = SchedulerConfig(
+        64,
+        num_seq_group,
+        max_model_len,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8
     cache_config.num_gpu_blocks = 8
@@ -67,7 +79,9 @@ def test_scheduler_schedule_simple():
 
     # Add seq groups to scheduler.
     for i in range(num_seq_group):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=block_size,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
         running.append(seq_group)
 
@@ -91,20 +105,24 @@ def test_scheduler_schedule_simple():
         append_new_token(out, 1)
 
 
-def test_scheduler_prefill_prioritized():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
     """Verify running batched tokens are not applied to prefill requests."""
     block_size = 4
     max_model_len = 30
     max_batched_num_tokens = 30
-    scheduler_config = SchedulerConfig(max_batched_num_tokens, 2,
-                                       max_model_len)
+    scheduler_config = SchedulerConfig(
+        max_batched_num_tokens,
+        2,
+        max_model_len,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 2
-    cache_config.num_gpu_blocks = 2
+    cache_config.num_cpu_blocks = 16
+    cache_config.num_gpu_blocks = 16
     scheduler = Scheduler(scheduler_config, cache_config, None)
 
     # Add seq groups to scheduler.
-    _, seq_group_a = create_dummy_prompt("1", 1)
+    _, seq_group_a = create_dummy_prompt("1", 1, block_size=block_size)
     scheduler.add_seq_group(seq_group_a)
 
     # Schedule seq groups prompts.
@@ -112,7 +130,7 @@ def test_scheduler_prefill_prioritized():
     assert get_sequence_groups(out) == [seq_group_a]
 
     # Add a new prefill request B.
-    _, seq_group_b = create_dummy_prompt("2", 30)
+    _, seq_group_b = create_dummy_prompt("2", 30, block_size=block_size)
     scheduler.add_seq_group(seq_group_b)
 
     # Verify prefill requests are prioritized. Since max_batched_num_tokens
@@ -121,18 +139,24 @@ def test_scheduler_prefill_prioritized():
     assert get_sequence_groups(out) == [seq_group_b]
 
 
-def test_scheduler_schedule_preempt_abort():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
     block_size = 4
     max_model_len = 16
-    scheduler_config = SchedulerConfig(64, 2, max_model_len)
+    scheduler_config = SchedulerConfig(
+        64, 2, max_model_len, use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 2
     cache_config.num_gpu_blocks = 2
     scheduler = Scheduler(scheduler_config, cache_config, None)
 
     # Add seq groups to scheduler.
-    seq_a, seq_group_a = create_dummy_prompt("1", block_size)
-    seq_b, seq_group_b = create_dummy_prompt("2", block_size)
+    seq_a, seq_group_a = create_dummy_prompt("1",
+                                             block_size,
+                                             block_size=block_size)
+    seq_b, seq_group_b = create_dummy_prompt("2",
+                                             block_size,
+                                             block_size=block_size)
     scheduler.add_seq_group(seq_group_a)
     scheduler.add_seq_group(seq_group_b)
@@ -170,12 +194,17 @@ def test_scheduler_schedule_preempt_abort():
     assert scheduler.get_num_unfinished_seq_groups() == 1
 
 
-def test_scheduler_max_seqs():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_max_seqs(use_v2_block_manager: bool):
     block_size = 4
     num_seq_group = 4
     max_seq_group = 2
     max_model_len = 16
-    scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len)
+    scheduler_config = SchedulerConfig(
+        64,
+        max_seq_group,
+        max_model_len,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8
     cache_config.num_gpu_blocks = 8
@@ -184,7 +213,9 @@ def test_scheduler_max_seqs():
     all_seq_groups: List[SequenceGroup] = []
     # Add seq groups to scheduler.
     for i in range(num_seq_group):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=block_size,
+                                           block_size=block_size)
         all_seq_groups.append(seq_group)
 
     # Append 1 seq group
@@ -211,9 +242,15 @@ def test_scheduler_max_seqs():
     assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])
 
 
-def test_scheduler_delay_factor():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_scheduler_delay_factor(use_v2_block_manager: bool):
     block_size = 4
-    scheduler_config = SchedulerConfig(100, 64, 16, delay_factor=0.5)
+    scheduler_config = SchedulerConfig(
+        100,
+        64,
+        16,
+        delay_factor=0.5,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8
     cache_config.num_gpu_blocks = 8
@@ -221,7 +258,8 @@ def test_scheduler_delay_factor():
 
     # schedule first prompt
     seq_group_meta, seq_group = create_dummy_prompt("0",
-                                                    prompt_length=block_size)
+                                                    prompt_length=block_size,
+                                                    block_size=block_size)
     scheduler.add_seq_group(seq_group)
     seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
     assert out.num_prefill_groups > 0
@@ -231,7 +269,8 @@ def test_scheduler_delay_factor():
     # wait for a second before scheduling next prompt
     time.sleep(1)
     seq_group_meta, seq_group = create_dummy_prompt("1",
-                                                    prompt_length=block_size)
+                                                    prompt_length=block_size,
+                                                    block_size=block_size)
     scheduler.add_seq_group(seq_group)
 
     # second prompt should *not* be scheduled
@@ -248,11 +287,20 @@ def test_scheduler_delay_factor():
         append_new_token(out, 1)
 
 
-def test_swapped_out_prioritized():
-    scheduler = initialize_scheduler(max_num_seqs=6)
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_swapped_out_prioritized(use_v2_block_manager: bool):
+    block_size = 4
+    scheduler = initialize_scheduler(max_num_seqs=6,
+                                     block_size=block_size,
+                                     use_v2_block_manager=use_v2_block_manager,
+                                     num_cpu_blocks=64,
+                                     num_gpu_blocks=64)
     # best_of=2 * 3 == 6 sequences.
     for i in range(3):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           best_of=2,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
     seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
     # prefill scheduled now.
@@ -276,7 +324,10 @@ def test_swapped_out_prioritized():
         append_new_token(out, 1)
 
     # Add 1 more task. Swap should be prioritized over prefill.
-    _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2)
+    _, seq_group = create_dummy_prompt(str(i),
+                                       prompt_length=60,
+                                       best_of=2,
+                                       block_size=block_size)
     scheduler.add_seq_group(seq_group)
     seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
     append_new_token(out, 1)
@@ -287,17 +338,26 @@ def test_swapped_out_prioritized():
     assert out.blocks_to_swap_out == []
 
 
-def initialize_scheduler(*,
-                         max_num_seqs=1000,
-                         max_token_budget=1000,
-                         max_model_len=1000,
-                         lora_config=None):
-    block_size = 4
-    scheduler_config = SchedulerConfig(max_token_budget, max_num_seqs,
-                                       max_model_len)
+def initialize_scheduler(
+    *,
+    max_num_seqs=1000,
+    max_token_budget=1000,
+    max_model_len=1000,
+    lora_config=None,
+    use_v2_block_manager=False,
+    block_size=4,
+    num_cpu_blocks=8,
+    num_gpu_blocks=8,
+):
+    block_size = block_size
+    scheduler_config = SchedulerConfig(
+        max_token_budget,
+        max_num_seqs,
+        max_model_len,
+        use_v2_block_manager=use_v2_block_manager)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 8
-    cache_config.num_gpu_blocks = 8
+    cache_config.num_cpu_blocks = num_cpu_blocks
+    cache_config.num_gpu_blocks = num_gpu_blocks
     scheduler = Scheduler(scheduler_config, cache_config, lora_config)
     return scheduler
@@ -319,12 +379,18 @@ def add_token_budget(budget: SchedulingBudget,
     budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs)
 
 
-def test_prefill_schedule_max_prompt_len():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
     """
     Test prompt longer than max_prompt_len is aborted.
     """
-    scheduler = initialize_scheduler(max_model_len=30)
-    _, seq_group = create_dummy_prompt("0", prompt_length=60)
+    block_size = 4
+    scheduler = initialize_scheduler(max_model_len=30,
+                                     use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size)
+    _, seq_group = create_dummy_prompt("0",
+                                       prompt_length=60,
+                                       block_size=block_size)
     scheduler.add_seq_group(seq_group)
     budget = create_token_budget()
     output = scheduler._schedule_prefills(budget, None)
@@ -336,14 +402,21 @@ def test_prefill_schedule_max_prompt_len():
     assert len(remaining_waiting) == 0
 
 
-def test_prefill_schedule_token_budget():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
     """
     Test token budget respected.
     """
-    scheduler = initialize_scheduler()
+    block_size = 4
+    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size,
+                                     num_cpu_blocks=64,
+                                     num_gpu_blocks=64)
     budget = create_token_budget(token_budget=0)
     for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
 
     # 0 token budget == nothing is scheduled.
@@ -366,10 +439,15 @@ def test_prefill_schedule_token_budget():
     assert len(remaining_waiting) == 1
 
     # Test when current_batched_tokens respected.
-    scheduler = initialize_scheduler()
+    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size,
+                                     num_cpu_blocks=16,
+                                     num_gpu_blocks=16)
     budget = create_token_budget(token_budget=60)
     add_token_budget(budget, 30, 0)
-    _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+    _, seq_group = create_dummy_prompt(str(i),
+                                       prompt_length=60,
+                                       block_size=block_size)
     # Cannot schedule a prompt that doesn't fit the budget.
     scheduler.add_seq_group(seq_group)
     output = scheduler._schedule_prefills(budget, None)
@@ -389,14 +467,21 @@ def test_prefill_schedule_token_budget():
     assert len(remaining_waiting) == 0
 
 
-def test_prefill_schedule_max_seqs():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
     """
     Test max seq respected.
    """
-    scheduler = initialize_scheduler()
+    block_size = 4
+    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size,
+                                     num_cpu_blocks=64,
+                                     num_gpu_blocks=64)
     budget = create_token_budget(max_num_seqs=2)
     for i in range(3):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
     output = scheduler._schedule_prefills(budget, None)
     remaining_waiting = scheduler.waiting
@@ -410,7 +495,9 @@ def test_prefill_schedule_max_seqs():
     scheduler.waiting = deque()
     budget = create_token_budget(max_num_seqs=2)
     add_token_budget(budget, 0, 2)
-    _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+    _, seq_group = create_dummy_prompt(str(i),
+                                       prompt_length=60,
+                                       block_size=block_size)
     scheduler.add_seq_group(seq_group)
     output = scheduler._schedule_prefills(budget, None)
     remaining_waiting = scheduler.waiting
@@ -421,17 +508,24 @@ def test_prefill_schedule_max_seqs():
     assert len(remaining_waiting) == 1
 
 
-def test_prefill_schedule_max_lora():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
     """
     Test max lora is respected and prioritized.
     """
+    block_size = 4
     lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
-    scheduler = initialize_scheduler(lora_config=lora_config)
+    scheduler = initialize_scheduler(lora_config=lora_config,
+                                     use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size,
+                                     num_cpu_blocks=64,
+                                     num_gpu_blocks=64)
     budget = create_token_budget(token_budget=120)
     curr_loras: Set[int] = set()
     for i in range(2):
         _, seq_group = create_dummy_prompt(str(i),
                                            prompt_length=60,
+                                           block_size=block_size,
                                            lora_request=LoRARequest(
                                                lora_name=str(i),
                                                lora_int_id=i + 1,
@@ -443,7 +537,9 @@ def test_prefill_schedule_max_lora():
     # If a request is not scheduled because it hits max lora, it is
     # prioritized. Verify that.
     for i in range(2, 4):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
     # Schedule 2 requests (0 and 2)
     output = scheduler._schedule_prefills(budget, curr_loras)
@@ -467,14 +563,21 @@ def test_prefill_schedule_max_lora():
     assert budget.num_batched_tokens == 60
 
 
-def test_prefill_schedule_no_block_manager_capacity():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
     """
     Test sequence cannot be scheduled due to block manager has no capacity.
     """
-    scheduler = initialize_scheduler()
+    block_size = 4
+    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size,
+                                     num_gpu_blocks=128,
+                                     num_cpu_blocks=128)
     budget = create_token_budget()
     for i in range(3):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
     scheduler.block_manager.can_allocate = MagicMock()
     scheduler.block_manager.can_allocate.return_value = AllocStatus.LATER
@@ -489,7 +592,9 @@ def test_prefill_schedule_no_block_manager_capacity():
     scheduler = initialize_scheduler()
     budget = create_token_budget()
     for i in range(3):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           block_size=block_size)
         scheduler.add_seq_group(seq_group)
     scheduler.block_manager.can_allocate = MagicMock()
     scheduler.block_manager.can_allocate.return_value = AllocStatus.NEVER
@@ -502,14 +607,21 @@ def test_prefill_schedule_no_block_manager_capacity():
     assert len(remaining_waiting) == 0
 
 
-def test_decode_schedule_preempted():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_decode_schedule_preempted(use_v2_block_manager: bool):
     """
     Test decodes cannot be scheduled and preempted.
     """
-    scheduler = initialize_scheduler()
+    block_size = 4
+    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size,
+                                     num_cpu_blocks=64,
+                                     num_gpu_blocks=64)
     curr_loras = None
     for i in range(3):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           block_size=block_size)
         scheduler._allocate_and_set_running(seq_group)
         append_new_token_seq_group(60, seq_group, 1)
         scheduler._add_seq_group_to_running(seq_group)
@@ -541,15 +653,23 @@ def test_decode_schedule_preempted():
     assert output.blocks_to_copy == []
 
 
-def test_decode_swap_beam_search():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_decode_swap_beam_search(use_v2_block_manager: bool):
     """
     Test best_of > 1 swap out blocks
     """
-    scheduler = initialize_scheduler()
+    block_size = 4
+    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size,
+                                     num_gpu_blocks=64,
+                                     num_cpu_blocks=64)
     curr_loras = None
     budget = create_token_budget()
     for i in range(3):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           best_of=2,
+                                           block_size=block_size)
         scheduler._allocate_and_set_running(seq_group)
         scheduler._add_seq_group_to_running(seq_group)
         append_new_token_seq_group(60, seq_group, 1)
@@ -589,12 +709,20 @@ def test_decode_swap_beam_search():
     assert output.blocks_to_copy == []
 
 
-def test_schedule_decode_blocks_to_copy_update():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
     """
     Verify blocks_to_copy is updated.
     """
-    scheduler = initialize_scheduler()
-    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+    block_size = 4
+    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+                                     block_size=4,
+                                     num_cpu_blocks=16,
+                                     num_gpu_blocks=16)
+    _, seq_group = create_dummy_prompt("1",
+                                       prompt_length=60,
+                                       best_of=2,
+                                       block_size=block_size)
     curr_loras = None
     scheduler._allocate_and_set_running(seq_group)
     append_new_token_seq_group(60, seq_group, 1)
@@ -644,12 +772,17 @@ def test_schedule_swapped_simple():
     assert blocks_to_swap_out == blocks_to_swap_in_reverse
 
 
-def test_schedule_swapped_max_token_budget():
-    scheduler = initialize_scheduler()
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
+    block_size = 4
+    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size,
+                                     num_cpu_blocks=32,
+                                     num_gpu_blocks=32)
     curr_loras = None
     blocks_to_swap_out: List[Tuple[int, int]] = []
-    for _ in range(2):
-        _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2)
         scheduler._allocate_and_set_running(seq_group)
         append_new_token_seq_group(60, seq_group, 1)
         scheduler._swap_out(seq_group, blocks_to_swap_out)
@@ -676,12 +809,19 @@ def test_schedule_swapped_max_token_budget():
     assert len(output.prefill_seq_groups) == 0
 
 
-def test_schedule_swapped_max_seqs():
-    scheduler = initialize_scheduler()
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
+    block_size = 4
+    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size,
+                                     num_cpu_blocks=64,
+                                     num_gpu_blocks=64)
     curr_loras = None
     blocks_to_swap_out: List[Tuple[int, int]] = []
     for i in range(4):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           block_size=4)
         scheduler._allocate_and_set_running(seq_group)
         append_new_token_seq_group(60, seq_group, 1)
         scheduler._swap_out(seq_group, blocks_to_swap_out)
@@ -706,14 +846,21 @@ def test_schedule_swapped_max_seqs():
     assert len(output.prefill_seq_groups) == 0
 
 
-def test_schedule_swapped_max_loras():
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
+    block_size = 4
     lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
-    scheduler = initialize_scheduler(lora_config=lora_config)
+    scheduler = initialize_scheduler(lora_config=lora_config,
+                                     use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size,
+                                     num_cpu_blocks=32,
+                                     num_gpu_blocks=32)
     curr_loras: Set[int] = set()
     blocks_to_swap_out: List[Tuple[int, int]] = []
     for i in range(2):
         _, seq_group = create_dummy_prompt(str(i),
                                            prompt_length=60,
+                                           block_size=block_size,
                                            lora_request=LoRARequest(
                                                lora_name=str(i),
                                                lora_int_id=i + 1,
@@ -734,12 +881,20 @@ def test_schedule_swapped_max_loras():
     assert len(curr_loras) == 1
 
 
-def test_schedule_swapped_cannot_swap_in():
-    scheduler = initialize_scheduler()
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
+    block_size = 4
+    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size,
+                                     num_cpu_blocks=32,
+                                     num_gpu_blocks=32)
     curr_loras = None
     blocks_to_swap_out: List[Tuple[int, int]] = []
-    for _ in range(2):
-        _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           best_of=2,
+                                           block_size=block_size)
         scheduler._allocate_and_set_running(seq_group)
         append_new_token_seq_group(60, seq_group, 1)
         scheduler._swap_out(seq_group, blocks_to_swap_out)
@@ -759,12 +914,20 @@ def test_schedule_swapped_cannot_swap_in():
     assert len(output.prefill_seq_groups) == 0
 
 
-def test_infeasible_swap():
-    scheduler = initialize_scheduler()
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_infeasible_swap(use_v2_block_manager: bool):
+    block_size = 4
+    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size,
+                                     num_cpu_blocks=32,
+                                     num_gpu_blocks=32)
     curr_loras = None
     blocks_to_swap_out: List[Tuple[int, int]] = []
-    for _ in range(2):
-        _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           best_of=2,
+                                           block_size=block_size)
         scheduler._allocate_and_set_running(seq_group)
         append_new_token_seq_group(60, seq_group, 1)
         scheduler._swap_out(seq_group, blocks_to_swap_out)
@@ -785,10 +948,18 @@ def test_infeasible_swap():
     assert len(output.prefill_seq_groups) == 0
 
 
-def test_schedule_swapped_blocks_to_copy():
-    scheduler = initialize_scheduler()
+@pytest.mark.parametrize('use_v2_block_manager', [True, False])
+def test_schedule_swapped_blocks_to_copy(use_v2_block_manager: bool):
+    block_size = 4
+    scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
+                                     block_size=block_size,
+                                     num_cpu_blocks=32,
+                                     num_gpu_blocks=32)
     curr_loras = None
-    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
+    _, seq_group = create_dummy_prompt("1",
+                                       prompt_length=60,
+                                       best_of=2,
+                                       block_size=block_size)
     scheduler._allocate_and_set_running(seq_group)
     append_new_token_seq_group(60, seq_group, 1)
     blocks_to_swap_out: List[Tuple[int, int]] = []