
Commit 20a15cc

[V1][Metrics] Deprecate vllm:model_forward/execute_time_milliseconds
Metrics originally added by vllm-project#9659. These seem to be of questionable value relative to the existing prefill, decode, and inference time metrics. Since they would also be challenging to implement in V1, and they don't conform to the standard of using seconds as units, let's deprecate them.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
1 parent df51e19 commit 20a15cc
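
For context on the units point in the commit message: the Prometheus convention is to export durations in base units (seconds) and let dashboards rescale, which is why the millisecond-based histograms touched by this commit are being deprecated rather than carried into V1. The sketch below shows what a seconds-based request-timing histogram looks like when defined directly with prometheus_client; the metric name example:request_inference_time_seconds, its label, and its buckets are illustrative assumptions, not vLLM's actual metric definitions.

from prometheus_client import CollectorRegistry, Histogram

registry = CollectorRegistry()

# Durations exported in seconds (the Prometheus base unit), unlike the
# deprecated *_milliseconds histograms touched by this commit.
inference_time = Histogram(
    "example:request_inference_time_seconds",  # illustrative name only
    "Histogram of end-to-end inference time per request, in seconds.",
    labelnames=["model_name"],
    buckets=[0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0],
    registry=registry)

# Record a 1.234 s request for a given model label.
inference_time.labels(model_name="example-model").observe(1.234)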

vllm/engine/metrics.py

Lines changed: 12 additions & 4 deletions
@@ -190,18 +190,26 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
                 "DEPRECATED: use vllm:request_queue_time_seconds instead."),
             labelnames=labelnames,
             buckets=request_latency_buckets)
+
+        # Deprecated in 0.8 - use prefill/decode/inference time metrics
+        # TODO: in 0.9, only enable if show_hidden_metrics=True
         self.histogram_model_forward_time_request = self._histogram_cls(
             name="vllm:model_forward_time_milliseconds",
-            documentation=
-            "Histogram of time spent in the model forward pass in ms.",
+            documentation=(
+                "Histogram of time spent in the model forward pass in ms. "
+                "DEPRECATED: use prefill/decode/inference time metrics instead."
+            ),
             labelnames=labelnames,
             buckets=build_1_2_3_5_8_buckets(3000))
         self.histogram_model_execute_time_request = self._histogram_cls(
             name="vllm:model_execute_time_milliseconds",
-            documentation=
-            "Histogram of time spent in the model execute function in ms.",
+            documentation=(
+                "Histogram of time spent in the model execute function in ms. "
+                "DEPRECATED: use prefill/decode/inference time metrics instead."
+            ),
             labelnames=labelnames,
             buckets=build_1_2_3_5_8_buckets(3000))
+
         # Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
             name="vllm:request_prompt_tokens",
