Commit 8eb4731

[V1][Metrics] Add e2e_latency histogram
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Parent: 022bcc7

4 files changed, 24 insertions(+), 2 deletions(-)

tests/entrypoints/openai/test_metrics.py
Lines changed: 3 additions & 0 deletions

@@ -218,6 +218,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
         "vllm:time_per_output_token_seconds_sum",
         "vllm:time_per_output_token_seconds_bucket",
         "vllm:time_per_output_token_seconds_count",
+        "vllm:e2e_request_latency_seconds_sum",
+        "vllm:e2e_request_latency_seconds_bucket",
+        "vllm:e2e_request_latency_seconds_count",
     ]
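The three suffixes follow from how prometheus_client exposes a histogram: every Histogram is scraped as a _sum, a set of cumulative _bucket series, and a _count. A minimal standalone sketch of that expansion (not part of this commit; the "example-model" label value is made up):

import prometheus_client

# Declaring one histogram yields three series families on /metrics.
hist = prometheus_client.Histogram(
    name="vllm:e2e_request_latency_seconds",
    documentation="Histogram of e2e request latency in seconds.",
    labelnames=["model_name"]).labels("example-model")
hist.observe(1.2)  # record one 1.2s request

exposition = prometheus_client.generate_latest().decode()
assert "vllm:e2e_request_latency_seconds_sum" in exposition
assert "vllm:e2e_request_latency_seconds_bucket" in exposition
assert "vllm:e2e_request_latency_seconds_count" in exposition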
vllm/v1/engine/output_processor.py
Lines changed: 2 additions & 1 deletion

@@ -40,7 +40,8 @@ def __init__(
         self.is_prefilling = True
         self.queue = queue

-        self.stats = RequestStateStats(last_token_time=arrival_time)
+        self.stats = RequestStateStats(arrival_time=arrival_time,
+                                       last_token_time=arrival_time)

     @classmethod
     def from_new_request(
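Seeding both fields from the same arrival_time is the point of this change: last_token_time is advanced as tokens stream out and drives the per-token interval metrics, while arrival_time stays fixed so the subtraction at finish time in stats.py measures the full request lifetime. A hypothetical construction mirroring the new line:

import time
from vllm.v1.metrics.stats import RequestStateStats

arrival_time = time.time()
# Both clocks start together; only last_token_time is updated later.
stats = RequestStateStats(arrival_time=arrival_time,
                          last_token_time=arrival_time)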

vllm/v1/metrics/loggers.py
Lines changed: 13 additions & 0 deletions

@@ -162,6 +162,17 @@ def __init__(self, model_config: ModelConfig):
             ],
             labelnames=labelnames).labels(*labelvalues)

+        request_latency_buckets = [
+            0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
+            40.0, 50.0, 60.0
+        ]
+        self.histogram_e2e_time_request = \
+            prometheus_client.Histogram(
+                name="vllm:e2e_request_latency_seconds",
+                documentation="Histogram of e2e request latency in seconds.",
+                buckets=request_latency_buckets,
+                labelnames=labelnames).labels(*labelvalues)
+
     def log(self, scheduler_stats: SchedulerStats,
             iteration_stats: IterationStats):
         """Log to prometheus."""
@@ -176,6 +187,8 @@ def log(self, scheduler_stats: SchedulerStats,

         for finished_request in iteration_stats.finished_requests:
             self.counter_request_success[finished_request.finish_reason].inc()
+            self.histogram_e2e_time_request.observe(
+                finished_request.e2e_latency)
             self.histogram_num_prompt_tokens_request.observe(
                 finished_request.num_prompt_tokens)
             self.histogram_num_generation_tokens_request.observe(
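The bucket bounds are cumulative upper edges ("le" labels): a 1.2s request increments the 1.5 bucket and every larger one, and anything slower than 60s is counted only by the implicit +Inf bucket. A small dependency-free sketch (not from the commit) of which edge an observation first lands on:

import bisect

request_latency_buckets = [
    0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
    40.0, 50.0, 60.0
]

def smallest_bucket(latency: float) -> str:
    """Smallest 'le' bound whose bucket counts this observation."""
    i = bisect.bisect_left(request_latency_buckets, latency)
    if i == len(request_latency_buckets):
        return "+Inf"  # slower than every explicit bound
    return str(request_latency_buckets[i])

print(smallest_bucket(1.2))   # 1.5
print(smallest_bucket(75.0))  # +Inf

The spread from sub-second to one minute suggests the buckets are tuned for interactive serving, where most requests finish within tens of seconds.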

vllm/v1/metrics/stats.py
Lines changed: 6 additions & 1 deletion

@@ -25,6 +25,7 @@ class RequestStateStats:
     """Stats that need to be tracked across delta updates."""

     num_generation_tokens: int = 0
+    arrival_time: float = 0.0
     last_token_time: float = 0.0


@@ -33,6 +34,7 @@ class FinishedRequestStats:
     """Stats associated with a finished request."""

     finish_reason: "FinishReason"
+    e2e_latency: float = 0.0
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0

@@ -77,7 +79,10 @@ def update_from_output(self, output: "EngineCoreOutput",
     def update_from_finished_request(self, finish_reason: "FinishReason",
                                      request_output: "RequestOutput",
                                      request_state_stats: RequestStateStats):
+        now = time.time()
+        e2e_latency = now - request_state_stats.arrival_time
+
         self.finished_requests.append(
-            FinishedRequestStats(finish_reason,
+            FinishedRequestStats(finish_reason, e2e_latency,
                                  len(request_output.prompt_token_ids),
                                  request_state_stats.num_generation_tokens))
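Taken together, the new plumbing reduces to one subtraction at finish time. A hypothetical end-to-end trace (the dataclass fields match the diff; the sleep stands in for real prefill and decode work):

import time
from dataclasses import dataclass

@dataclass
class RequestStateStats:
    """Stats that need to be tracked across delta updates."""
    num_generation_tokens: int = 0
    arrival_time: float = 0.0
    last_token_time: float = 0.0

arrival = time.time()
state = RequestStateStats(arrival_time=arrival, last_token_time=arrival)

time.sleep(0.1)  # stand-in for prefill + decode

e2e_latency = time.time() - state.arrival_time
print(f"e2e_latency: {e2e_latency:.3f}s")  # ~0.100s; this is the value fed
                                           # to histogram_e2e_time_request.observe()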
