
Commit 9d9794b

add torch.profiler.record_function to show number of new/cached requests
1 parent 428bc7b commit 9d9794b

File tree

1 file changed: +16 −3 lines

vllm/v1/worker/gpu_worker.py

Lines changed: 16 additions & 3 deletions
@@ -508,6 +508,18 @@ def get_model(self) -> nn.Module:
     def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
         return self.model_runner.get_supported_tasks()
 
+    def annotate_profile(self, scheduler_output):
+        # add trace annotation so that we can easily distinguish new/cached request numbers in each iteration
+        if not self.profiler:
+            return nullcontext()
+
+        num_new = len(scheduler_output.scheduled_new_reqs)
+        num_cached = len(scheduler_output.scheduled_cached_reqs.req_ids)
+
+        return torch.profiler.record_function(
+            f"execute_new_{num_new}_cached_{num_cached}"
+        )
+
     @torch.inference_mode()
     def sample_tokens(
         self, grammar_output: "GrammarOutput | None"
@@ -535,9 +547,10 @@ def execute_model(
             )
         )
 
-        output = self.model_runner.execute_model(scheduler_output, intermediate_tensors)
-        if isinstance(output, (ModelRunnerOutput, NoneType)):
-            return output
+        with self.annotate_profile(scheduler_output):
+            output = self.model_runner.execute_model(scheduler_output, intermediate_tensors)
+            if isinstance(output, (ModelRunnerOutput, NoneType)):
+                return output
 
         assert isinstance(output, IntermediateTensors)
         parallel_config = self.vllm_config.parallel_config
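
For context, here is a minimal, self-contained sketch of how a torch.profiler.record_function annotation like the one added above surfaces in a profile. This is not part of the commit: the label and the matmul workload are placeholders standing in for the execute_new_{num_new}_cached_{num_cached} range that annotate_profile records around each execute_model call.

import torch
from torch.profiler import ProfilerActivity, profile, record_function

x = torch.randn(128, 128)

with profile(activities=[ProfilerActivity.CPU]) as prof:
    # The annotated region shows up as a named block in the trace,
    # analogous to one execute_model iteration in the commit above.
    with record_function("execute_new_2_cached_5"):
        y = x @ x

# The named range appears in the aggregated table (and in an exported
# Chrome trace), so iterations can be told apart by their labels.
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))

Because annotate_profile returns nullcontext() when no profiler is attached, the with statement in execute_model is a no-op outside of profiling runs.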
