Commit 6dfef71 (parent 0dabfa4)

[V1][Metrics] Add prefill/decode/inference time histograms

Prefill time: first token time - first scheduled time
Decode time: finish time - first token time
Inference time: finish time - first scheduled time

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
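To make the relationship between these timestamps concrete, here is a small illustrative sketch. The literal values and variable names are invented for the example and are not part of the commit; only the arithmetic mirrors the definitions above:

    import time

    # Hypothetical timestamps for one request (example values only):
    arrival_time = time.time()                        # request received
    first_scheduled_time = arrival_time + 0.05        # left WAITING, started RUNNING
    first_token_time = first_scheduled_time + 0.30    # first output token produced
    finish_time = first_token_time + 1.20             # request finished

    queue_time = first_scheduled_time - arrival_time          # existing WAITING histogram
    prefill_time = first_token_time - first_scheduled_time    # vllm:request_prefill_time_seconds
    decode_time = finish_time - first_token_time              # vllm:request_decode_time_seconds
    inference_time = finish_time - first_scheduled_time       # vllm:request_inference_time_seconds
    e2e_latency = finish_time - arrival_time                  # existing e2e histogram

    # By construction: prefill + decode == inference, and queue + inference == e2e.
    assert abs((prefill_time + decode_time) - inference_time) < 1e-9
    assert abs((queue_time + inference_time) - e2e_latency) < 1e-9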

File tree: 2 files changed (+46, −5 lines)

vllm/v1/metrics/loggers.py (27 additions, 0 deletions)
@@ -179,6 +179,27 @@ def __init__(self, model_config: ModelConfig):
                 "Histogram of time spent in WAITING phase for request.",
                 buckets=request_latency_buckets,
                 labelnames=labelnames).labels(*labelvalues)
+        self.histogram_inference_time_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_inference_time_seconds",
+                documentation=
+                "Histogram of time spent in RUNNING phase for request.",
+                buckets=request_latency_buckets,
+                labelnames=labelnames).labels(*labelvalues)
+        self.histogram_prefill_time_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_prefill_time_seconds",
+                documentation=
+                "Histogram of time spent in PREFILL phase for request.",
+                buckets=request_latency_buckets,
+                labelnames=labelnames).labels(*labelvalues)
+        self.histogram_decode_time_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_decode_time_seconds",
+                documentation=
+                "Histogram of time spent in DECODE phase for request.",
+                buckets=request_latency_buckets,
+                labelnames=labelnames).labels(*labelvalues)
 
     def log(self, scheduler_stats: SchedulerStats,
             iteration_stats: IterationStats):
@@ -196,6 +217,10 @@ def log(self, scheduler_stats: SchedulerStats,
             self.counter_request_success[finished_request.finish_reason].inc()
             self.histogram_e2e_time_request.observe(
                 finished_request.e2e_latency)
+            self.histogram_inference_time_request.observe(
+                finished_request.inference_time)
+            self.histogram_decode_time_request.observe(
+                finished_request.decode_time)
             self.histogram_num_prompt_tokens_request.observe(
                 finished_request.num_prompt_tokens)
             self.histogram_num_generation_tokens_request.observe(
@@ -207,6 +232,8 @@ def log(self, scheduler_stats: SchedulerStats,
             self.histogram_time_per_output_token.observe(tpot)
         for queue_time in iteration_stats.queue_times_iter:
             self.histogram_queue_time_request.observe(queue_time)
+        for prefill_time in iteration_stats.prefill_times_iter:
+            self.histogram_prefill_time_request.observe(prefill_time)
 
     @staticmethod
     def _unregister_vllm_metrics():
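The new histograms follow the same registration pattern as the existing latency histograms: a labelled prometheus_client.Histogram bound to concrete label values once, then fed with observe(). As a standalone illustration of that pattern, assuming made-up bucket values, label names, and a demo metric name rather than vLLM's actual configuration:

    import prometheus_client

    # Illustrative buckets and labels only; vLLM defines its own
    # request_latency_buckets and label names elsewhere in loggers.py.
    request_latency_buckets = [0.3, 0.5, 0.8, 1.6, 2.6, 4.3, 7.1, 11.7, 19.3, 32.0]
    labelnames = ["model_name"]
    labelvalues = ["demo-model"]

    histogram_prefill_time = prometheus_client.Histogram(
        name="demo:request_prefill_time_seconds",
        documentation="Histogram of time spent in PREFILL phase for request.",
        buckets=request_latency_buckets,
        labelnames=labelnames).labels(*labelvalues)

    # Each finished prefill contributes one observation to the bound child.
    histogram_prefill_time.observe(0.42)

    # The default registry can then be scraped or dumped for inspection.
    print(prometheus_client.generate_latest().decode())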

vllm/v1/metrics/stats.py (19 additions, 5 deletions)
@@ -29,6 +29,7 @@ class RequestStateStats:
     num_generation_tokens: int = 0
     arrival_time: float = 0.0
     first_scheduled_time: float = 0.0
+    first_token_time: float = 0.0
     last_token_time: float = 0.0
 
 
@@ -40,6 +41,8 @@ class FinishedRequestStats:
     e2e_latency: float = 0.0
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
+    inference_time: float = 0.0
+    decode_time: float = 0.0
 
 
 class IterationStats:
@@ -53,6 +56,7 @@ def __init__(self, log_stats: bool):
         self.time_to_first_tokens_iter: List[float] = []
         self.time_per_output_tokens_iter: List[float] = []
         self.queue_times_iter: List[float] = []
+        self.prefill_times_iter: List[float] = []
 
     def update_from_output(self, output: "EngineCoreOutput",
                            is_prefilling: bool, prompt_len: int,
@@ -72,8 +76,12 @@ def update_from_output(self, output: "EngineCoreOutput",
             # iff num_computed_tokens == num_tokens).
             assert (num_new_generation_tokens > 0)
             self.num_prompt_tokens += prompt_len
+            self.first_token_time = now
 
             self.time_to_first_tokens_iter.append(last_token_latency)
+
+            prefill_time = now - request_state_stats.first_scheduled_time
+            self.prefill_times_iter.append(prefill_time)
         else:
             self.time_per_output_tokens_iter.append(last_token_latency)
 
@@ -92,8 +100,14 @@ def update_from_finished_request(self, finish_reason: "FinishReason",
                                      request_state_stats: RequestStateStats):
         now = time.time()
         e2e_latency = now - request_state_stats.arrival_time
-
-        self.finished_requests.append(
-            FinishedRequestStats(finish_reason, e2e_latency,
-                                 len(request_output.prompt_token_ids),
-                                 request_state_stats.num_generation_tokens))
+        inference_time = now - request_state_stats.first_scheduled_time
+        decode_time = now - request_state_stats.first_token_time
+
+        finished_req = \
+            FinishedRequestStats(finish_reason=finish_reason,
+                                 e2e_latency=e2e_latency,
+                                 num_prompt_tokens=len(request_output.prompt_token_ids),
+                                 num_generation_tokens=request_state_stats.num_generation_tokens,
+                                 inference_time=inference_time,
+                                 decode_time=decode_time)
+        self.finished_requests.append(finished_req)
