[V1][Scheduler] Avoid calling _try_schedule_encoder_inputs
for every request (#15778)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
70fedd0f79
commit
9b459eca88
@ -171,19 +171,23 @@ class Scheduler(SchedulerInterface):
|
||||
assert num_new_tokens > 0
|
||||
|
||||
# Schedule encoder inputs.
|
||||
encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget = (
|
||||
self._try_schedule_encoder_inputs(request,
|
||||
request.num_computed_tokens,
|
||||
num_new_tokens,
|
||||
encoder_budget))
|
||||
if num_new_tokens == 0:
|
||||
# The request cannot be scheduled because the encoder budget
|
||||
# or the encoder cache is exhausted.
|
||||
# NOTE(woosuk): Here, by doing `continue` instead of `break`,
|
||||
# we do not strictly follow the FCFS scheduling policy and
|
||||
# allow the lower-priority requests to be scheduled.
|
||||
req_index += 1
|
||||
continue
|
||||
if request.has_encoder_inputs:
|
||||
(encoder_inputs_to_schedule, num_new_tokens,
|
||||
new_encoder_budget) = self._try_schedule_encoder_inputs(
|
||||
request, request.num_computed_tokens, num_new_tokens,
|
||||
encoder_budget)
|
||||
if num_new_tokens == 0:
|
||||
# The request cannot be scheduled because the encoder budget
|
||||
# or the encoder cache is exhausted.
|
||||
# NOTE(woosuk): By using `continue` instead of `break` here,
|
||||
# we intentionally relax the strict FCFS scheduling policy
|
||||
# to allow lower-priority requests to be scheduled when a
|
||||
# higher-priority request is blocked by encoder constraints.
|
||||
req_index += 1
|
||||
continue
|
||||
else:
|
||||
encoder_inputs_to_schedule = None
|
||||
new_encoder_budget = encoder_budget
|
||||
|
||||
while True:
|
||||
new_blocks = self.kv_cache_manager.allocate_slots(
|
||||
@ -318,13 +322,17 @@ class Scheduler(SchedulerInterface):
|
||||
assert num_new_tokens > 0
|
||||
|
||||
# Schedule encoder inputs.
|
||||
(encoder_inputs_to_schedule, num_new_tokens,
|
||||
new_encoder_budget) = self._try_schedule_encoder_inputs(
|
||||
request, num_computed_tokens, num_new_tokens,
|
||||
encoder_budget)
|
||||
if num_new_tokens == 0:
|
||||
# The request cannot be scheduled.
|
||||
break
|
||||
if request.has_encoder_inputs:
|
||||
(encoder_inputs_to_schedule, num_new_tokens,
|
||||
new_encoder_budget) = self._try_schedule_encoder_inputs(
|
||||
request, num_computed_tokens, num_new_tokens,
|
||||
encoder_budget)
|
||||
if num_new_tokens == 0:
|
||||
# The request cannot be scheduled.
|
||||
break
|
||||
else:
|
||||
encoder_inputs_to_schedule = None
|
||||
new_encoder_budget = encoder_budget
|
||||
|
||||
new_blocks = self.kv_cache_manager.allocate_slots(
|
||||
request, num_new_tokens, computed_blocks)
|
||||
@ -506,9 +514,6 @@ class Scheduler(SchedulerInterface):
|
||||
limitations, the method adjusts `num_new_tokens` to schedule only the
|
||||
decoder tokens up to just before the unschedulable encoder input.
|
||||
"""
|
||||
if not request.has_encoder_inputs():
|
||||
return [], num_new_tokens, encoder_budget
|
||||
|
||||
encoder_inputs_to_schedule: list[int] = []
|
||||
mm_positions = request.mm_positions
|
||||
assert mm_positions is not None
|
||||
|
@ -59,6 +59,8 @@ class Request:
|
||||
self.mm_positions = multi_modal_placeholders or []
|
||||
self.mm_inputs = multi_modal_inputs or []
|
||||
self.mm_hashes: list[str] = multi_modal_hashes or []
|
||||
self.num_encoder_inputs = len(self.mm_inputs)
|
||||
self.has_encoder_inputs = self.num_encoder_inputs > 0
|
||||
|
||||
# Sanity check
|
||||
assert len(self.mm_inputs) == len(self.mm_positions)
|
||||
@ -117,13 +119,6 @@ class Request:
|
||||
def get_finished_reason(self) -> Union[FinishReason, None]:
|
||||
return RequestStatus.get_finished_reason(self.status)
|
||||
|
||||
def has_encoder_inputs(self) -> bool:
|
||||
return len(self.mm_inputs) > 0
|
||||
|
||||
@property
|
||||
def num_encoder_inputs(self) -> int:
|
||||
return len(self.mm_positions)
|
||||
|
||||
def get_num_encoder_tokens(self, input_id: int) -> int:
|
||||
assert input_id < len(self.mm_positions)
|
||||
num_tokens = self.mm_positions[input_id]["length"]
|
||||
|
Loading…
x
Reference in New Issue
Block a user