fix: add missing max_num_seqs metrics for SGLang and TensorRT-LLM backends

keivenchang · keivenchang · commit a6884b20fea9 · 2025-09-24T16:13:43.000Z
SGLang: get max_num_seqs from server_args since SGLang separates config from runtime stats
TensorRT-LLM: populate max_num_seqs and max_num_batched_tokens from config for metrics consistency
diff --git a/components/backends/sglang/src/dynamo/sglang/register.py b/components/backends/sglang/src/dynamo/sglang/register.py
@@ -23,7 +23,7 @@ async def register_llm_with_runtime_config(
     Returns:
         bool: True if registration succeeded, False if it failed
     """
-    runtime_config = await _get_runtime_config(engine, dynamo_args)
+    runtime_config = await _get_runtime_config(engine, server_args, dynamo_args)
     input_type = ModelInput.Tokens
     output_type = ModelType.Chat | ModelType.Completions
     if not server_args.skip_tokenizer_init:
@@ -51,13 +51,25 @@ async def register_llm_with_runtime_config(
 
 
 async def _get_runtime_config(
-    engine: sgl.Engine, dynamo_args: DynamoArgs
+    engine: sgl.Engine, server_args: ServerArgs, dynamo_args: DynamoArgs
 ) -> Optional[ModelRuntimeConfig]:
     """Get runtime config from SGLang engine"""
     runtime_config = ModelRuntimeConfig()
     # set reasoning parser and tool call parser
     runtime_config.reasoning_parser = dynamo_args.reasoning_parser
     runtime_config.tool_call_parser = dynamo_args.tool_call_parser
+
+    # In SGLang, these are server_args, not scheduler_info (unlike vLLM)
+    # Note: If --max-running-requests is not specified, SGLang falls back to an internal,
+    # undocumented default. The value here will be None if not explicitly set by the user.
+    max_running_requests = getattr(server_args, "max_running_requests", None)
+    if max_running_requests:
+        runtime_config.max_num_seqs = max_running_requests
+
+    max_prefill_tokens = getattr(server_args, "max_prefill_tokens", None)
+    if max_prefill_tokens:
+        runtime_config.max_num_batched_tokens = max_prefill_tokens
+
     try:
         # Try to check if the engine has a scheduler attribute with the computed values
         if hasattr(engine, "scheduler_info") and engine.scheduler_info is not None:
@@ -77,7 +89,10 @@ async def _get_runtime_config(
                             f"(max_total_tokens={max_total_tokens}, page_size={page_size})"
                         )
 
-            # Note: max_running_requests and max_prefill_tokens are NOT available in scheduler_info
+            # Note: max_running_requests and max_prefill_tokens are NOT available in scheduler_info.
+            # SGLang separates configuration (server_args) from runtime stats (scheduler_info).
+            # In contrast, vLLM exposes both config and runtime values through engine config.
+            # These are config parameters, so they must be retrieved from server_args only.
 
             return runtime_config
 
diff --git a/components/backends/trtllm/src/dynamo/trtllm/main.py b/components/backends/trtllm/src/dynamo/trtllm/main.py
@@ -281,9 +281,29 @@ async def init(runtime: DistributedRuntime, config: Config):
         # TODO: fix this once we have a better way to get total_kv_blocks
         runtime_config = ModelRuntimeConfig()
 
+        # Set values from config that are available immediately
+        # Note: We populate max_num_seqs and max_num_batched_tokens from config
+        # to ensure Prometheus metrics are available even without engine stats
+
+        # Naming clarification:
+        # - In vLLM: max_num_seqs = maximum concurrent requests (an unusual name kept for historical reasons in vLLM)
+        # - In TensorRT-LLM: max_batch_size = maximum concurrent requests (clearer name)
+        # Both parameters control the same thing: how many requests can be processed simultaneously
+        runtime_config.max_num_seqs = config.max_batch_size
+        runtime_config.max_num_batched_tokens = config.max_num_tokens
         runtime_config.reasoning_parser = config.reasoning_parser
         runtime_config.tool_call_parser = config.tool_call_parser
 
+        logging.info(f"Set runtime config max_num_seqs: {runtime_config.max_num_seqs}")
+        logging.info(
+            f"Set runtime config max_num_batched_tokens: {runtime_config.max_num_batched_tokens}"
+        )
+
+        # The get_engine_runtime_config function exists but is not called here because:
+        # 1. get_stats_async requires active requests to work properly
+        # 2. We need runtime config during registration, before any requests are made
+        # 3. total_kv_blocks would ideally come from engine stats but is not critical for basic operation
+
         # publisher will be set later if publishing is enabled.
         handler_config = RequestHandlerConfig(
             component=component,