
Commit 342d17f

baxingpiaochong authored and yewentao256 committed

[V1][Metrics] Add per-request TPOT histogram (#24015)

Signed-off-by: baxingpiaochong <771405853@qq.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
1 parent 3c62d28 commit 342d17f

File tree

vllm/v1/metrics/loggers.py
vllm/v1/metrics/stats.py

2 files changed: 24 additions, 1 deletion

vllm/v1/metrics/loggers.py

Lines changed: 15 additions & 0 deletions
@@ -411,6 +411,19 @@ def __init__(self,
         self.histogram_inter_token_latency = make_per_engine(
             histogram_inter_token_latency, engine_indexes, model_name)
 
+        histogram_request_time_per_output_token = self._histogram_cls(
+            name="vllm:request_time_per_output_token_seconds",
+            documentation=
+            "Histogram of time_per_output_token_seconds per request.",
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+            ],
+            labelnames=labelnames)
+        self.histogram_request_time_per_output_token = make_per_engine(
+            histogram_request_time_per_output_token, engine_indexes,
+            model_name)
+
         request_latency_buckets = [
             0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
             40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
@@ -583,6 +596,8 @@ def record(self,
             finished_request.num_prompt_tokens)
         self.histogram_num_generation_tokens_request[engine_idx].observe(
             finished_request.num_generation_tokens)
+        self.histogram_request_time_per_output_token[engine_idx].observe(
+            finished_request.mean_time_per_output_token)
         if finished_request.max_tokens_param:
             self.histogram_max_tokens_request[engine_idx].observe(
                 finished_request.max_tokens_param)
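
For context, here is a minimal standalone sketch of what this histogram amounts to, using prometheus_client directly. The metric name, documentation string, and bucket edges mirror the diff; the direct Histogram construction and the label values are illustrative assumptions, since vLLM actually builds the metric through self._histogram_cls and replicates it per engine with make_per_engine.

# Minimal sketch, assuming plain prometheus_client; vLLM's real wiring goes
# through self._histogram_cls and make_per_engine, so the direct construction
# and the label values below are illustrative only.
from prometheus_client import Histogram

request_tpot = Histogram(
    name="vllm:request_time_per_output_token_seconds",
    documentation="Histogram of time_per_output_token_seconds per request.",
    buckets=[
        0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
        1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
    ],
    labelnames=["model_name", "engine"],
)

# Each finished request contributes exactly one observation: its mean TPOT.
request_tpot.labels(model_name="my-model", engine="0").observe(0.042)

Because a Prometheus histogram stores per-bucket counts plus a running sum, per-request TPOT quantiles can later be approximated in PromQL with histogram_quantile over this metric.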

vllm/v1/metrics/stats.py

Lines changed: 9 additions & 1 deletion
@@ -86,6 +86,7 @@ class FinishedRequestStats:
     prefill_time: float = 0.0
     inference_time: float = 0.0
     decode_time: float = 0.0
+    mean_time_per_output_token: float = 0.0
 
 
 class IterationStats:
@@ -177,6 +178,12 @@ def update_from_finished_request(self, finish_reason: "FinishReason",
         # Any preemptions during prefill or decode are included
         inference_time = req_stats.last_token_ts - req_stats.scheduled_ts
 
+        # Do not count the token generated by the prefill phase
+        mean_time_per_output_token = (decode_time /
+                                      (req_stats.num_generation_tokens - 1)
+                                      if req_stats.num_generation_tokens -
+                                      1 > 0 else 0)
+
         finished_req = \
             FinishedRequestStats(finish_reason=finish_reason,
                                  e2e_latency=e2e_latency,
@@ -186,7 +193,8 @@ def update_from_finished_request(self, finish_reason: "FinishReason",
                                  queued_time=queued_time,
                                  prefill_time=prefill_time,
                                  inference_time=inference_time,
-                                 decode_time=decode_time)
+                                 decode_time=decode_time,
+                                 mean_time_per_output_token=mean_time_per_output_token)
         self.finished_requests.append(finished_req)
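To make the formula concrete, here is a self-contained sketch of the mean-TPOT computation added above; the dataclass and its field names are simplified stand-ins for vLLM's per-request stats object, not the real class.

# Self-contained sketch of the mean-TPOT formula; ReqStatsStub is a
# simplified, hypothetical stand-in for vLLM's per-request stats object.
from dataclasses import dataclass


@dataclass
class ReqStatsStub:
    first_token_ts: float       # timestamp of the first output token
    last_token_ts: float        # timestamp of the last output token
    num_generation_tokens: int  # total output tokens, including the first


def mean_time_per_output_token(req: ReqStatsStub) -> float:
    # The decode phase spans from the first output token to the last.
    decode_time = req.last_token_ts - req.first_token_ts
    # The first token comes out of prefill, so the mean is taken over the
    # remaining num_generation_tokens - 1 decode-phase tokens.
    if req.num_generation_tokens - 1 > 0:
        return decode_time / (req.num_generation_tokens - 1)
    return 0.0


# 10 output tokens over a 9-second decode phase -> 9 / 9 = 1.0 s/token.
print(mean_time_per_output_token(ReqStatsStub(0.0, 9.0, 10)))  # 1.0

The guard also covers single-token requests, where there is no decode-phase token to average over, so the metric records 0 rather than dividing by zero.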