2020
2121logger = init_logger (__name__ )
2222
23+ # Used to trigger dummy requests whose outputs should be ignored.
24+ DUMMY_REQ_ID = "__DUMMY_REQ_ID"
25+
2326
2427class Scheduler :
2528
@@ -483,6 +486,7 @@ def update_from_output(
483486
484487 new_running : List [Request ] = []
485488 outputs : List [EngineCoreOutput ] = []
489+ finished_requests : List [str ] = []
486490
487491 # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below
488492 # loop can be a performance bottleneck. We should do our best to avoid
@@ -564,17 +568,21 @@ def update_from_output(
564568 new_logprobs = logprobs .slice (req_index , req_index + 1 )
565569
566570 # Transmit partial if chunked prefill & prompt logprobs is enabled
567- if new_token_ids or prompt_logprobs_tensors is not None :
571+ if (new_token_ids or prompt_logprobs_tensors is not None ) \
572+ and req_id != DUMMY_REQ_ID :
568573 # Add EngineCoreOutput for this Request.
574+ finish_reason = request .get_finished_reason ()
569575 outputs .append (
570576 EngineCoreOutput (
571577 request_id = req_id ,
572578 new_token_ids = new_token_ids ,
573- finish_reason = request . get_finished_reason () ,
579+ finish_reason = finish_reason ,
574580 new_logprobs = new_logprobs ,
575581 new_prompt_logprobs_tensors = prompt_logprobs_tensors ,
576582 stop_reason = request .stop_reason ,
577583 events = request .take_events ()))
584+ if finish_reason :
585+ finished_requests .append (req_id )
578586
579587 self .scheduled_req_ids .remove (request .request_id )
580588 if not stopped :
@@ -583,6 +591,7 @@ def update_from_output(
583591 self .running = new_running
584592 return EngineCoreOutputs (
585593 outputs = outputs ,
594+ finished_requests = finished_requests ,
586595 scheduler_stats = self .make_stats (),
587596 )
588597
@@ -653,7 +662,7 @@ def get_num_unfinished_requests(self) -> int:
653662 return len (self .waiting ) + len (self .running )
654663
655664 def has_unfinished_requests (self ) -> bool :
656- return self .get_num_unfinished_requests ( ) > 0
665+ return len ( self .running ) > 0 or len ( self . waiting ) > 0
657666
658667 def get_num_unscheduled_requests (self ) -> int :
659668 """Number of requests that are not being processed by the executor."""
0 commit comments