Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 28 additions & 23 deletions vllm/v1/core/sched/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,19 +171,23 @@ def schedule(self) -> SchedulerOutput:
assert num_new_tokens > 0

# Schedule encoder inputs.
encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget = (
self._try_schedule_encoder_inputs(request,
request.num_computed_tokens,
num_new_tokens,
encoder_budget))
if num_new_tokens == 0:
# The request cannot be scheduled because the encoder budget
# or the encoder cache is exhausted.
# NOTE(woosuk): Here, by doing `continue` instead of `break`,
# we do not strictly follow the FCFS scheduling policy and
# allow the lower-priority requests to be scheduled.
req_index += 1
continue
if request.has_encoder_inputs:
(encoder_inputs_to_schedule, num_new_tokens,
new_encoder_budget) = self._try_schedule_encoder_inputs(
request, request.num_computed_tokens, num_new_tokens,
encoder_budget)
if num_new_tokens == 0:
# The request cannot be scheduled because the encoder budget
# or the encoder cache is exhausted.
# NOTE(woosuk): By using `continue` instead of `break` here,
# we intentionally relax the strict FCFS scheduling policy
# to allow lower-priority requests to be scheduled when a
# higher-priority request is blocked by encoder constraints.
req_index += 1
continue
else:
encoder_inputs_to_schedule = None
new_encoder_budget = encoder_budget

while True:
new_blocks = self.kv_cache_manager.allocate_slots(
Expand Down Expand Up @@ -318,13 +322,17 @@ def schedule(self) -> SchedulerOutput:
assert num_new_tokens > 0

# Schedule encoder inputs.
(encoder_inputs_to_schedule, num_new_tokens,
new_encoder_budget) = self._try_schedule_encoder_inputs(
request, num_computed_tokens, num_new_tokens,
encoder_budget)
if num_new_tokens == 0:
# The request cannot be scheduled.
break
if request.has_encoder_inputs:
(encoder_inputs_to_schedule, num_new_tokens,
new_encoder_budget) = self._try_schedule_encoder_inputs(
request, num_computed_tokens, num_new_tokens,
encoder_budget)
if num_new_tokens == 0:
# The request cannot be scheduled.
break
else:
encoder_inputs_to_schedule = None
new_encoder_budget = encoder_budget

new_blocks = self.kv_cache_manager.allocate_slots(
request, num_new_tokens, computed_blocks)
Expand Down Expand Up @@ -506,9 +514,6 @@ def _try_schedule_encoder_inputs(
limitations, the method adjusts `num_new_tokens` to schedule only the
decoder tokens up to just before the unschedulable encoder input.
"""
if not request.has_encoder_inputs():
return [], num_new_tokens, encoder_budget

encoder_inputs_to_schedule: list[int] = []
mm_positions = request.mm_positions
assert mm_positions is not None
Expand Down
9 changes: 2 additions & 7 deletions vllm/v1/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def __init__(
self.mm_positions = multi_modal_placeholders or []
self.mm_inputs = multi_modal_inputs or []
self.mm_hashes: list[str] = multi_modal_hashes or []
self.num_encoder_inputs = len(self.mm_inputs)
self.has_encoder_inputs = self.num_encoder_inputs > 0

# Sanity check
assert len(self.mm_inputs) == len(self.mm_positions)
Expand Down Expand Up @@ -117,13 +119,6 @@ def is_finished(self) -> bool:
def get_finished_reason(self) -> Union[FinishReason, None]:
    """Return the reason this request finished, or ``None`` if still active.

    Delegates to ``RequestStatus.get_finished_reason`` using the request's
    current ``self.status``.
    """
    return RequestStatus.get_finished_reason(self.status)

def has_encoder_inputs(self) -> bool:
return len(self.mm_inputs) > 0

@property
def num_encoder_inputs(self) -> int:
return len(self.mm_positions)

def get_num_encoder_tokens(self, input_id: int) -> int:
assert input_id < len(self.mm_positions)
num_tokens = self.mm_positions[input_id]["length"]
Expand Down