[Core] Scheduler perf fix (#4270)

2024-04-23 06:11:06 +09:00 · 2024-04-23 06:11:06 +09:00 · ad8d696a99
commit ad8d696a99
parent 3d925165f2
2 changed files with 11 additions and 14 deletions
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@ -540,7 +540,7 @@ def test_decode_schedule_preempted():
    curr_loras = None
    for i in range(3):
        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
-        scheduler._allocate_and_set_running(seq_group, 60)
+        scheduler._allocate_and_set_running(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        running.append(seq_group)
    scheduler.block_manager.can_append_slots = MagicMock()
@ -581,7 +581,7 @@ def test_decode_swap_beam_search():
    budget = create_token_budget()
    for i in range(3):
        _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2)
-        scheduler._allocate_and_set_running(seq_group, 60)
+        scheduler._allocate_and_set_running(seq_group)
        running.append(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        budget.add_num_seqs(seq_group.request_id,
@ -629,7 +629,7 @@ def test_schedule_decode_blocks_to_copy_update():
    running = deque()
    policy = PolicyFactory.get_policy(policy_name="fcfs")
    curr_loras = None
-    scheduler._allocate_and_set_running(seq_group, 60)
+    scheduler._allocate_and_set_running(seq_group)
    append_new_token_seq_group(60, seq_group, 1)
    running.append(seq_group)
@ -659,7 +659,7 @@ def test_schedule_swapped_simple():
    curr_loras = None
    blocks_to_swap_out = {}
    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
-    scheduler._allocate_and_set_running(seq_group, 60)
+    scheduler._allocate_and_set_running(seq_group)
    append_new_token_seq_group(60, seq_group, 1)
    scheduler._swap_out(seq_group, blocks_to_swap_out)
    swapped.append(seq_group)
@ -687,7 +687,7 @@ def test_schedule_swapped_max_token_budget():
    blocks_to_swap_out = {}
    for _ in range(2):
        _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
-        scheduler._allocate_and_set_running(seq_group, 60)
+        scheduler._allocate_and_set_running(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        scheduler._swap_out(seq_group, blocks_to_swap_out)
        swapped.append(seq_group)
@ -721,7 +721,7 @@ def test_schedule_swapped_max_seqs():
    blocks_to_swap_out = {}
    for i in range(4):
        _, seq_group = create_dummy_prompt(str(i), prompt_length=60)
-        scheduler._allocate_and_set_running(seq_group, 60)
+        scheduler._allocate_and_set_running(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        scheduler._swap_out(seq_group, blocks_to_swap_out)
        swapped.append(seq_group)
@ -759,7 +759,7 @@ def test_schedule_swapped_max_loras():
                                               lora_name=str(i),
                                               lora_int_id=i + 1,
                                               lora_local_path="abc"))
-        scheduler._allocate_and_set_running(seq_group, 60)
+        scheduler._allocate_and_set_running(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        scheduler._swap_out(seq_group, blocks_to_swap_out)
        swapped.append(seq_group)
@ -783,7 +783,7 @@ def test_schedule_swapped_cannot_swap_in():
    blocks_to_swap_out = {}
    for _ in range(2):
        _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
-        scheduler._allocate_and_set_running(seq_group, 60)
+        scheduler._allocate_and_set_running(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        scheduler._swap_out(seq_group, blocks_to_swap_out)
        swapped.append(seq_group)
@ -808,7 +808,7 @@ def test_schedule_swapped_blocks_to_copy():
    policy = PolicyFactory.get_policy(policy_name="fcfs")
    curr_loras = None
    _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
-    scheduler._allocate_and_set_running(seq_group, 60)
+    scheduler._allocate_and_set_running(seq_group)
    append_new_token_seq_group(60, seq_group, 1)
    blocks_to_swap_out = {}
    scheduler._swap_out(seq_group, blocks_to_swap_out)
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@ -297,7 +297,6 @@ class Scheduler:
    def add_seq_group(self, seq_group: SequenceGroup) -> None:
        # Add sequence groups to the waiting queue.
        logger.debug(f"add_seq_group {seq_group.request_id}")
        self.waiting.append(seq_group)
    def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
@ -427,7 +426,6 @@ class Scheduler:
                        swapped_out.append(seq_group)
                    break
            else:
                logger.debug(f"append slot for {seq_group}")
                self._append_slots(seq_group, blocks_to_copy)
                is_prefill = seq_group.is_prefill()
                if is_prefill:
@ -659,7 +657,7 @@ class Scheduler:
            if curr_loras is not None and lora_int_id > 0:
                curr_loras.add(lora_int_id)
            waiting_queue.popleft()
-            self._allocate_and_set_running(seq_group, num_new_tokens)
+            self._allocate_and_set_running(seq_group)
            seq_groups.append(
                ScheduledSequenceGroup(seq_group=seq_group,
                                       token_chunk_size=num_new_tokens))
@ -952,8 +950,7 @@ class Scheduler:
        self.running = deque(seq_group for seq_group in self.running
                             if not seq_group.is_finished())
-    def _allocate_and_set_running(self, seq_group: SequenceGroup,
+    def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
                                  num_new_tokens: int) -> None:
        self.block_manager.allocate(seq_group)
        for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
            seq.status = SequenceStatus.RUNNING