
Commit 342d17f

baxingpiaochong authored and yewentao256 committed

[V1][Metrics] Add per-request TPOT histogram (#24015)

Signed-off-by: baxingpiaochong <771405853@qq.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
1 parent 3c62d28 commit 342d17f

File tree

vllm/v1/metrics/loggers.py
vllm/v1/metrics/stats.py

2 files changed: 24 additions, 1 deletion

vllm/v1/metrics/loggers.py

Lines changed: 15 additions & 0 deletions
@@ -411,6 +411,19 @@ def __init__(self,
         self.histogram_inter_token_latency = make_per_engine(
             histogram_inter_token_latency, engine_indexes, model_name)
 
+        histogram_request_time_per_output_token = self._histogram_cls(
+            name="vllm:request_time_per_output_token_seconds",
+            documentation=
+            "Histogram of time_per_output_token_seconds per request.",
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+            ],
+            labelnames=labelnames)
+        self.histogram_request_time_per_output_token = make_per_engine(
+            histogram_request_time_per_output_token, engine_indexes,
+            model_name)
+
         request_latency_buckets = [
             0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
             40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
@@ -583,6 +596,8 @@ def record(self,
             finished_request.num_prompt_tokens)
         self.histogram_num_generation_tokens_request[engine_idx].observe(
             finished_request.num_generation_tokens)
+        self.histogram_request_time_per_output_token[engine_idx].observe(
+            finished_request.mean_time_per_output_token)
         if finished_request.max_tokens_param:
             self.histogram_max_tokens_request[engine_idx].observe(
                 finished_request.max_tokens_param)
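
For context, here is a minimal standalone sketch of what this histogram amounts to, using prometheus_client directly. The metric name, documentation string, and bucket edges mirror the diff; the direct Histogram construction and the label values are illustrative assumptions, since vLLM actually builds the metric through self._histogram_cls and replicates it per engine with make_per_engine.

# Minimal sketch, assuming plain prometheus_client; vLLM's real wiring goes
# through self._histogram_cls and make_per_engine, so the direct construction
# and the label values below are illustrative only.
from prometheus_client import Histogram

request_tpot = Histogram(
    name="vllm:request_time_per_output_token_seconds",
    documentation="Histogram of time_per_output_token_seconds per request.",
    buckets=[
        0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
        1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
    ],
    labelnames=["model_name", "engine"],
)

# Each finished request contributes exactly one observation: its mean TPOT.
request_tpot.labels(model_name="my-model", engine="0").observe(0.042)

Because a Prometheus histogram stores per-bucket counts plus a running sum, per-request TPOT quantiles can later be approximated in PromQL with histogram_quantile over this metric.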

vllm/v1/metrics/stats.py

Lines changed: 9 additions & 1 deletion
@@ -86,6 +86,7 @@ class FinishedRequestStats:
     prefill_time: float = 0.0
     inference_time: float = 0.0
     decode_time: float = 0.0
+    mean_time_per_output_token: float = 0.0
 
 
 class IterationStats:
@@ -177,6 +178,12 @@ def update_from_finished_request(self, finish_reason: "FinishReason",
         # Any preemptions during prefill or decode are included
         inference_time = req_stats.last_token_ts - req_stats.scheduled_ts
 
+        # Do not count the token generated by the prefill phase
+        mean_time_per_output_token = (decode_time /
+                                      (req_stats.num_generation_tokens - 1)
+                                      if req_stats.num_generation_tokens -
+                                      1 > 0 else 0)
+
         finished_req = \
             FinishedRequestStats(finish_reason=finish_reason,
                                  e2e_latency=e2e_latency,
@@ -186,7 +193,8 @@ def update_from_finished_request(self, finish_reason: "FinishReason",
                                  queued_time=queued_time,
                                  prefill_time=prefill_time,
                                  inference_time=inference_time,
-                                 decode_time=decode_time)
+                                 decode_time=decode_time,
+                                 mean_time_per_output_token=mean_time_per_output_token)
         self.finished_requests.append(finished_req)
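To make the formula concrete, here is a self-contained sketch of the mean-TPOT computation added above; the dataclass and its field names are simplified stand-ins for vLLM's per-request stats object, not the real class.

# Self-contained sketch of the mean-TPOT formula; ReqStatsStub is a
# simplified, hypothetical stand-in for vLLM's per-request stats object.
from dataclasses import dataclass


@dataclass
class ReqStatsStub:
    first_token_ts: float       # timestamp of the first output token
    last_token_ts: float        # timestamp of the last output token
    num_generation_tokens: int  # total output tokens, including the first


def mean_time_per_output_token(req: ReqStatsStub) -> float:
    # The decode phase spans from the first output token to the last.
    decode_time = req.last_token_ts - req.first_token_ts
    # The first token comes out of prefill, so the mean is taken over the
    # remaining num_generation_tokens - 1 decode-phase tokens.
    if req.num_generation_tokens - 1 > 0:
        return decode_time / (req.num_generation_tokens - 1)
    return 0.0


# 10 output tokens over a 9-second decode phase -> 9 / 9 = 1.0 s/token.
print(mean_time_per_output_token(ReqStatsStub(0.0, 9.0, 10)))  # 1.0

The guard also covers single-token requests, where there is no decode-phase token to average over, so the metric records 0 rather than dividing by zero.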