diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 4c2f715820317..81c78bda3b505 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1027,16 +1027,21 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: # Update waiting requests. self.waiting.extendleft(running_scheduled.preempted) + # Update new running requests. - self.running.extend([s.seq_group for s in prefills.seq_groups]) - self.running.extend( - [s.seq_group for s in running_scheduled.decode_seq_groups]) - self.running.extend( - [s.seq_group for s in running_scheduled.prefill_seq_groups]) + # By default, vLLM scheduler prioritizes prefills. + # Once chunked prefill is enabled, + # the policy is changed to prioritize decode requests. self.running.extend( [s.seq_group for s in swapped_in.decode_seq_groups]) self.running.extend( [s.seq_group for s in swapped_in.prefill_seq_groups]) + self.running.extend( + [s.seq_group for s in running_scheduled.decode_seq_groups]) + self.running.extend( + [s.seq_group for s in running_scheduled.prefill_seq_groups]) + self.running.extend([s.seq_group for s in prefills.seq_groups]) + # Update swapped requests. self.swapped.extend(running_scheduled.swapped_out) return SchedulerOutputs(