1 file changed: +16 −3

@@ -508,6 +508,18 @@ def get_model(self) -> nn.Module:
     def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
         return self.model_runner.get_supported_tasks()
 
+    def annotate_profile(self, scheduler_output):
+        # Add a trace annotation so new/cached request counts are easy to distinguish per iteration.
+        if not self.profiler:
+            return nullcontext()
+
+        num_new = len(scheduler_output.scheduled_new_reqs)
+        num_cached = len(scheduler_output.scheduled_cached_reqs.req_ids)
+
+        return torch.profiler.record_function(
+            f"execute_new_{num_new}_cached_{num_cached}"
+        )
+
     @torch.inference_mode()
     def sample_tokens(
         self, grammar_output: "GrammarOutput | None"
@@ -535,9 +547,10 @@ def execute_model(
             )
         )
 
-        output = self.model_runner.execute_model(scheduler_output, intermediate_tensors)
-        if isinstance(output, (ModelRunnerOutput, NoneType)):
-            return output
+        with self.annotate_profile(scheduler_output):
+            output = self.model_runner.execute_model(scheduler_output, intermediate_tensors)
+            if isinstance(output, (ModelRunnerOutput, NoneType)):
+                return output
 
         assert isinstance(output, IntermediateTensors)
         parallel_config = self.vllm_config.parallel_config
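
For context, the annotation relies on `torch.profiler.record_function`, which injects a named range into the profiler trace only while a profile is active. Below is a minimal, self-contained sketch of the same pattern outside the PR; the names `annotate` and `profiler_active` are illustrative, not from the diff.

```python
from contextlib import nullcontext

import torch


def annotate(profiler_active: bool, num_new: int, num_cached: int):
    # Mirror the PR's pattern: a no-op context when profiling is off,
    # otherwise a named range that shows up as a row in the trace.
    if not profiler_active:
        return nullcontext()
    return torch.profiler.record_function(
        f"execute_new_{num_new}_cached_{num_cached}"
    )


with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU]
) as prof:
    with annotate(True, num_new=2, num_cached=5):
        _ = torch.ones(1024) @ torch.ones(1024)  # stand-in for model execution

# The dynamic label appears as its own entry, so iterations with different
# new/cached request mixes are easy to tell apart when inspecting a trace.
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))
```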