
Commit 0dabfa4

[V1][Metrics] Add queue time histogram
The engine core process sends back a list of request IDs for newly scheduled requests with its iteration outputs, and the frontend computes queue_time relative to arrival_time. We take this approach, rather than having the scheduler compute the interval itself, so that we never compare timestamps taken in different processes, avoiding any possible future issues if those processes run on different kernels.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
1 parent 8eb4731 commit 0dabfa4
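
For illustration, here is a minimal standalone sketch of the pattern this commit adopts: the engine core reports only the IDs of newly scheduled requests, and the frontend derives queue time entirely from timestamps taken on its own clock. The Frontend and RequestStats names below are hypothetical stand-ins, not the vLLM API.

import time
from dataclasses import dataclass, field
from typing import Dict, List, Optional

@dataclass
class RequestStats:
    # Hypothetical stand-in for the frontend's per-request bookkeeping.
    arrival_time: float
    first_scheduled_time: Optional[float] = None

@dataclass
class Frontend:
    requests: Dict[str, RequestStats] = field(default_factory=dict)
    queue_times: List[float] = field(default_factory=list)

    def on_engine_output(self, new_req_ids: Optional[List[str]]) -> None:
        # Only request IDs cross the process boundary; both timestamps
        # compared below were taken by this process, so clocks in the
        # engine core and frontend never need to agree.
        if new_req_ids is None:
            return
        now = time.time()
        for req_id in new_req_ids:
            stats = self.requests.get(req_id)
            if stats is None:
                continue  # request was already aborted
            stats.first_scheduled_time = now
            self.queue_times.append(now - stats.arrival_time)

frontend = Frontend()
frontend.requests["req-0"] = RequestStats(arrival_time=time.time())
frontend.on_engine_output(new_req_ids=["req-0"])
print(frontend.queue_times)  # e.g. [2.1e-05]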

File tree

5 files changed: +58 -16


vllm/v1/core/scheduler.py

Lines changed: 13 additions & 2 deletions

@@ -477,7 +477,7 @@ def update_from_output(
         self.running = new_running
         return EngineCoreOutputs(
             outputs=outputs,
-            scheduler_stats=self.make_stats(),
+            scheduler_stats=self.make_stats(scheduler_output),
         )
 
     def _check_stop(self, request: Request) -> bool:
@@ -548,11 +548,22 @@ def has_unfinished_requests(self) -> bool:
     def reset_prefix_cache(self) -> bool:
         return self.kv_cache_manager.reset_prefix_cache()
 
-    def make_stats(self) -> SchedulerStats:
+    def make_stats(
+        self,
+        scheduler_output: Optional["SchedulerOutput"] = None
+    ) -> SchedulerStats:
+        if scheduler_output is not None and scheduler_output.scheduled_new_reqs:
+            new_req_ids = [
+                req_data.req_id
+                for req_data in scheduler_output.scheduled_new_reqs
+            ]
+        else:
+            new_req_ids = None
         return SchedulerStats(
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
             gpu_cache_usage=self.kv_cache_manager.usage,
+            new_req_ids=new_req_ids,
         )
vllm/v1/engine/async_llm.py

Lines changed: 6 additions & 2 deletions

@@ -246,6 +246,12 @@ async def _run_output_handler(self):
                 # 1) Pull EngineCoreOutputs from the EngineCore.
                 outputs = await self.engine_core.get_output_async()
 
+                # Record a timestamp for newly scheduled requests
+                iteration_stats = IterationStats(self.log_stats)
+                if outputs.scheduler_stats.new_req_ids is not None:
+                    self.output_processor.record_first_scheduled_time(
+                        outputs.scheduler_stats.new_req_ids, iteration_stats)
+
                 # Split outputs into chunks of at most
                 # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the
                 # event loop for too long.
@@ -257,14 +263,12 @@ async def _run_output_handler(self):
                         outputs.outputs,
                         cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE))
 
-                iteration_stats = None
                 for i, outputs_slice in enumerate(slices):
                     # 2) Process EngineCoreOutputs.
                     processed_outputs = self.output_processor.process_outputs(
                         outputs_slice, iteration_stats)
                     # NOTE: RequestOutputs are pushed to their queues.
                     assert not processed_outputs.request_outputs
-                    iteration_stats = processed_outputs.iteration_stats
 
                     # Allow other asyncio tasks to run between chunks
                     if i + 1 < len(slices):
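
As a side note on the hunks above: IterationStats is now constructed once per engine-core poll and shared by every chunk slice, instead of being created lazily inside process_outputs(). A minimal sketch of that lifecycle, assuming simplified stand-ins (CHUNK_SIZE, handle_outputs, and the bare IterationStats class below are illustrative, not the real vLLM types):

import asyncio
from typing import List

CHUNK_SIZE = 4  # stand-in for VLLM_V1_OUTPUT_PROC_CHUNK_SIZE

class IterationStats:
    # Simplified stand-in; the real class tracks much more.
    def __init__(self) -> None:
        self.num_outputs = 0

async def handle_outputs(outputs: List[str]) -> IterationStats:
    # One stats object per poll, shared across all chunks.
    iteration_stats = IterationStats()
    chunks = [
        outputs[i:i + CHUNK_SIZE] for i in range(0, len(outputs), CHUNK_SIZE)
    ]
    for i, chunk in enumerate(chunks):
        iteration_stats.num_outputs += len(chunk)  # "process" the chunk
        # Allow other asyncio tasks to run between chunks.
        if i + 1 < len(chunks):
            await asyncio.sleep(0)
    return iteration_stats

stats = asyncio.run(handle_outputs([f"out-{i}" for i in range(10)]))
print(stats.num_outputs)  # 10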

vllm/v1/engine/output_processor.py

Lines changed: 18 additions & 11 deletions

@@ -18,7 +18,6 @@ class OutputProcessorOutput:
 
     request_outputs: List[RequestOutput]
     reqs_to_abort: List[str]
-    iteration_stats: IterationStats
 
 
 class RequestState:
@@ -105,6 +104,15 @@ def add_request(
             request=request,
             queue=queue)
 
+    def record_first_scheduled_time(self, new_req_ids: List[str],
+                                    iteration_stats: IterationStats) -> None:
+        for req_id in new_req_ids:
+            req_state = self.request_states.get(req_id)
+            if req_state is None:
+                # Ignore output for already-aborted request.
+                continue
+            iteration_stats.update_from_newly_scheduled(req_state.stats)
+
     def process_outputs(
         self,
         engine_core_outputs: List[EngineCoreOutput],
@@ -141,8 +149,6 @@ def process_outputs(
 
         request_outputs: List[RequestOutput] = []
         reqs_to_abort: List[str] = []
-        if not iteration_stats:
-            iteration_stats = IterationStats(self.log_stats)
         for engine_core_output in engine_core_outputs:
             req_id = engine_core_output.request_id
             req_state = self.request_states.get(req_id)
@@ -151,10 +157,11 @@ def process_outputs(
                 continue
 
             # 1) Compute stats for this iteration.
-            iteration_stats.update_from_output(engine_core_output,
-                                               req_state.is_prefilling,
-                                               req_state.prompt_len,
-                                               req_state.stats)
+            if iteration_stats is not None:
+                iteration_stats.update_from_output(engine_core_output,
+                                                   req_state.is_prefilling,
+                                                   req_state.prompt_len,
+                                                   req_state.stats)
             req_state.is_prefilling = False
 
             # 2) Detokenize the token ids into text.
@@ -184,14 +191,14 @@ def process_outputs(
                     reqs_to_abort.append(req_id)
 
                 # Track per-request stats
-                iteration_stats.update_from_finished_request(
-                    detokenizer_output.finish_reason, request_output,
-                    req_state.stats)
+                if iteration_stats is not None:
+                    iteration_stats.update_from_finished_request(
+                        detokenizer_output.finish_reason, request_output,
+                        req_state.stats)
 
         return OutputProcessorOutput(
             request_outputs=request_outputs,
             reqs_to_abort=reqs_to_abort,
-            iteration_stats=iteration_stats,
         )
 
     @staticmethod

vllm/v1/metrics/loggers.py

Lines changed: 9 additions & 0 deletions

@@ -172,6 +172,13 @@ def __init__(self, model_config: ModelConfig):
             documentation="Histogram of e2e request latency in seconds.",
             buckets=request_latency_buckets,
             labelnames=labelnames).labels(*labelvalues)
+        self.histogram_queue_time_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_queue_time_seconds",
+                documentation=
+                "Histogram of time spent in WAITING phase for request.",
+                buckets=request_latency_buckets,
+                labelnames=labelnames).labels(*labelvalues)
 
     def log(self, scheduler_stats: SchedulerStats,
             iteration_stats: IterationStats):
@@ -198,6 +205,8 @@ def log(self, scheduler_stats: SchedulerStats,
             self.histogram_time_to_first_token.observe(ttft)
         for tpot in iteration_stats.time_per_output_tokens_iter:
             self.histogram_time_per_output_token.observe(tpot)
+        for queue_time in iteration_stats.queue_times_iter:
+            self.histogram_queue_time_request.observe(queue_time)
 
     @staticmethod
     def _unregister_vllm_metrics():
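
For reference, a self-contained sketch of how the new metric behaves under prometheus_client; the bucket boundaries and label name/value below are illustrative, since request_latency_buckets and the actual labels are defined elsewhere in loggers.py:

import prometheus_client

# Illustrative buckets; the real request_latency_buckets are not
# shown in this diff.
histogram_queue_time_request = prometheus_client.Histogram(
    name="vllm:request_queue_time_seconds",
    documentation="Histogram of time spent in WAITING phase for request.",
    buckets=[0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0],
    labelnames=["model_name"],
).labels("example-model")

histogram_queue_time_request.observe(0.42)  # one request waited 0.42s

# The scrape output now includes series such as
# vllm:request_queue_time_seconds_bucket{model_name="example-model",...}
print(prometheus_client.generate_latest().decode())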

vllm/v1/metrics/stats.py

Lines changed: 12 additions & 1 deletion

@@ -2,7 +2,7 @@
 
 import time
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Optional
 
 if TYPE_CHECKING:
     from vllm.outputs import RequestOutput
@@ -19,13 +19,16 @@ class SchedulerStats:
     gpu_cache_usage: float = 0.0
     # gpu_prefix_cache_hit_rate: float = 0.0
 
+    new_req_ids: Optional[List[str]] = None
+
 
 @dataclass
 class RequestStateStats:
     """Stats that need to be tracked across delta updates."""
 
     num_generation_tokens: int = 0
     arrival_time: float = 0.0
+    first_scheduled_time: float = 0.0
     last_token_time: float = 0.0
 
 
@@ -49,6 +52,7 @@ def __init__(self, log_stats: bool):
         self.finished_requests: List[FinishedRequestStats] = []
         self.time_to_first_tokens_iter: List[float] = []
         self.time_per_output_tokens_iter: List[float] = []
+        self.queue_times_iter: List[float] = []
 
     def update_from_output(self, output: "EngineCoreOutput",
                            is_prefilling: bool, prompt_len: int,
@@ -76,6 +80,13 @@ def update_from_output(self, output: "EngineCoreOutput",
         request_state_stats.num_generation_tokens += num_new_generation_tokens
         request_state_stats.last_token_time = now
 
+    def update_from_newly_scheduled(self,
+                                    request_state_stats: RequestStateStats):
+        now = time.time()
+        request_state_stats.first_scheduled_time = now
+        queue_time = now - request_state_stats.arrival_time
+        self.queue_times_iter.append(queue_time)
+
     def update_from_finished_request(self, finish_reason: "FinishReason",
                                      request_output: "RequestOutput",
                                      request_state_stats: RequestStateStats):
