@@ -25,7 +25,6 @@
 import torch
 import uvloop
 from vllm.distributed.kv_events import ZmqEventPublisher
-from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.inputs.data import TokensPrompt
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser
@@ -107,14 +106,15 @@ def endpoint_overwrite(args):
     def __init__(
         self,
         args: argparse.Namespace,
-        engine_args: AsyncEngineArgs,
         component: Component,
         endpoint: Endpoint,
+        config: Config,
     ):
         self.enable_disagg = args.enable_disagg
         self.endpoint = args.endpoint
         self.downstream_endpoint = args.downstream_endpoint
-        self.engine_args = engine_args
+        self.engine_args = config.engine_args
+        self.config = config
         self.setup_vllm_engine(component, endpoint)
 
     async def async_init(self, runtime: DistributedRuntime):
@@ -142,6 +142,7 @@ def setup_vllm_engine(self, component: Component, endpoint: Endpoint):
         self.stats_logger = StatLoggerFactory(
             component,
             self.engine_args.data_parallel_rank or 0,
+            metrics_labels=[("model", self.config.model)],
         )
         self.engine_client = AsyncLLM.from_vllm_config(
             vllm_config=vllm_config,
@@ -444,20 +445,24 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Config):
 
     if args.worker_type in ["prefill", "encode_prefill"]:
         handler: VllmBaseWorker = VllmPDWorker(
-            args, config.engine_args, component, generate_endpoint
+            args, component, generate_endpoint, config
         )
     elif args.worker_type == "decode":
-        handler = VllmDecodeWorker(
-            args, config.engine_args, component, generate_endpoint
-        )
+        handler = VllmDecodeWorker(args, component, generate_endpoint, config)
     await handler.async_init(runtime)
 
     logger.info(f"Starting to serve the {args.endpoint} endpoint...")
 
+    metrics_labels = [("model", config.model)]
+
     try:
         await asyncio.gather(
-            generate_endpoint.serve_endpoint(handler.generate),
-            clear_endpoint.serve_endpoint(handler.clear_kv_blocks),
+            generate_endpoint.serve_endpoint(
+                handler.generate, metrics_labels=metrics_labels
+            ),
+            clear_endpoint.serve_endpoint(
+                handler.clear_kv_blocks, metrics_labels=metrics_labels
+            ),
         )
     except Exception as e:
         logger.error(f"Failed to serve endpoints: {e}")
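
Taken together, the hunks swap the workers' bare AsyncEngineArgs parameter for the whole Config object, which also carries the model name used to label metrics on both the stats logger and every served endpoint. Below is a minimal sketch of the resulting call pattern; the Config dataclass and StubEndpoint are hypothetical stand-ins for the project's actual types, assumed only to expose engine_args, model, and a serve_endpoint that accepts a metrics_labels keyword.

import argparse
import asyncio
from dataclasses import dataclass


@dataclass
class Config:
    # Assumed shape: the real Config also exposes the vLLM AsyncEngineArgs.
    engine_args: object
    model: str


class VllmPDWorker:
    # Post-refactor signature from the diff: config replaces engine_args,
    # and the whole object is kept so stats can later be labeled with
    # config.model.
    def __init__(self, args, component, endpoint, config: Config):
        self.engine_args = config.engine_args
        self.config = config

    async def generate(self, request):
        return f"generated for {request!r} on {self.config.model}"


class StubEndpoint:
    # Hypothetical stand-in for the runtime's Endpoint type.
    async def serve_endpoint(self, handler, metrics_labels=None):
        print("serving", handler.__name__, "with labels", metrics_labels)


async def main():
    args = argparse.Namespace(worker_type="prefill")
    config = Config(engine_args=None, model="meta-llama/Llama-3-8B")
    generate_endpoint = StubEndpoint()
    handler = VllmPDWorker(args, None, generate_endpoint, config)
    # Every endpoint served by this worker now carries the same model label.
    metrics_labels = [("model", config.model)]
    await asyncio.gather(
        generate_endpoint.serve_endpoint(
            handler.generate, metrics_labels=metrics_labels
        ),
    )


asyncio.run(main())

Passing the single Config object rather than peeling off engine_args keeps the worker constructors stable as further per-model settings, such as these metrics labels, are threaded through.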