Skip to content

Commit c5eeae2

Browse files
authored
Merge branch 'main' into ayushag/add-pythonic-parser
2 parents c89a7dd + 8064849 commit c5eeae2

File tree

25 files changed

+464
-275
lines changed

25 files changed

+464
-275
lines changed

components/backends/sglang/deploy/disagg-multinode.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ spec:
1818
services:
1919
Frontend:
2020
dynamoNamespace: sglang-disagg-multinode
21-
componentType: main
21+
componentType: frontend
2222
replicas: 1
2323
extraPodSpec:
2424
mainContainer:
@@ -54,7 +54,7 @@ spec:
5454
multinode:
5555
nodeCount: 2
5656
envFromSecret: hf-token-secret
57-
dynamoNamespace: sglang-disagg
57+
dynamoNamespace: sglang-disagg-multinode
5858
componentType: worker
5959
replicas: 1
6060
resources:

components/backends/sglang/docs/multinode-examples.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@ SPDX-License-Identifier: Apache-2.0
99

1010
SGLang allows you to deploy multi-node sized models by adding in the `dist-init-addr`, `nnodes`, and `node-rank` arguments. Below we demonstrate an example of deploying DeepSeek R1 for disaggregated serving across 4 nodes. This example requires 4 nodes of 8xH100 GPUs.
1111

12+
**Prerequisite**: Building the Dynamo container.
13+
14+
```bash
15+
cd $DYNAMO_ROOT
16+
docker build -f container/Dockerfile.sglang-wideep . -t dynamo-wideep --no-cache
17+
```
18+
19+
You can use a specific tag from the [lmsys dockerhub](https://hub.docker.com/r/lmsysorg/sglang/tags) by adding `--build-arg SGLANG_IMAGE_TAG=<tag>` to the build command.
20+
1221
**Step 1**: Use the provided helper script to generate commands to start NATS/ETCD on your head prefill node. This script will also give you environment variables to export on each other node. You will need the IP addresses of your head prefill and head decode node to run this script.
1322
```bash
1423
./utils/gen_env_vars.sh

components/backends/sglang/src/dynamo/sglang/args.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from argparse import Namespace
1111
from dataclasses import dataclass
1212
from enum import Enum
13-
from typing import Any, Dict
13+
from typing import Any, Dict, Optional
1414

1515
from sglang.srt.server_args import ServerArgs
1616

@@ -39,6 +39,10 @@ class DynamoArgs:
3939
endpoint: str
4040
migration_limit: int
4141

42+
# tool and reasoning parser options
43+
tool_call_parser: Optional[str] = None
44+
reasoning_parser: Optional[str] = None
45+
4246

4347
class DisaggregationMode(Enum):
4448
AGGREGATED = "agg"
@@ -71,6 +75,20 @@ def parse_args(args: list[str]) -> Config:
7175
"--version", action="version", version=f"Dynamo Backend SGLang {__version__}"
7276
)
7377

78+
# To avoid name conflicts with different backends, adopted the prefix "dyn-" for Dynamo-specific args
79+
parser.add_argument(
80+
"--dyn-tool-call-parser",
81+
type=str,
82+
default=None,
83+
help="Tool call parser name for the model. Available options: 'hermes', 'nemotron_deci', 'llama3_json', 'mistral', 'phi4'.",
84+
)
85+
parser.add_argument(
86+
"--dyn-reasoning-parser",
87+
type=str,
88+
default=None,
89+
help="Reasoning parser name for the model. Available options: 'basic', 'deepseek_r1', 'gpt_oss'.",
90+
)
91+
7492
# Dynamo args
7593
for info in DYNAMO_ARGS.values():
7694
parser.add_argument(
@@ -123,6 +141,8 @@ def parse_args(args: list[str]) -> Config:
123141
component=parsed_component_name,
124142
endpoint=parsed_endpoint_name,
125143
migration_limit=parsed_args.migration_limit,
144+
tool_call_parser=parsed_args.dyn_tool_call_parser,
145+
reasoning_parser=parsed_args.dyn_reasoning_parser,
126146
)
127147
logging.debug(f"Dynamo args: {dynamo_args}")
128148

components/backends/sglang/src/dynamo/sglang/main.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,18 +81,43 @@ async def init(runtime: DistributedRuntime, config: Config):
8181
logging.info(f"Setting up ZMQ kv event publisher at {zmq_ep}")
8282
kv_publisher = ZmqKvEventPublisher(component=component, config=zmq_config)
8383

84+
# Readiness gate: requests wait until model is registered
85+
ready_event = asyncio.Event()
86+
87+
async def gated_generate(request):
88+
"""Queue requests until model registration completes"""
89+
await ready_event.wait() # Block until model is ready
90+
async for response in handler.generate(request):
91+
yield response
92+
8493
handler = DecodeWorkerHandler(
8594
component, engine, config, publisher, kv_publisher, prefill_client
8695
)
8796

88-
await register_llm_with_runtime_config(
89-
engine, generate_endpoint, server_args, dynamo_args.migration_limit
90-
)
97+
async def register_model():
98+
"""Register the model and signal readiness"""
99+
registration_success = await register_llm_with_runtime_config(
100+
engine,
101+
generate_endpoint,
102+
server_args,
103+
dynamo_args,
104+
)
105+
106+
if not registration_success:
107+
logging.error("Model registration failed; shutting down")
108+
runtime.shutdown()
109+
raise RuntimeError("Model registration failed")
110+
111+
# Model is ready - allow queued requests to proceed
112+
ready_event.set()
113+
logging.info("Model registration succeeded; processing queued requests")
91114

92115
try:
93-
# TODO: add in native endpoints
116+
# Start endpoint immediately and register model concurrently
117+
# Requests queue until ready_event is set
94118
await asyncio.gather(
95-
generate_endpoint.serve_endpoint(handler.generate, graceful_shutdown=False),
119+
generate_endpoint.serve_endpoint(gated_generate, graceful_shutdown=False),
120+
register_model(),
96121
)
97122
except Exception as e:
98123
logging.error(f"Failed to serve endpoints: {e}")

components/backends/sglang/src/dynamo/sglang/register.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,38 +9,49 @@
99

1010
from dynamo._core import Endpoint
1111
from dynamo.llm import ModelRuntimeConfig, ModelType, register_llm
12+
from dynamo.sglang.args import DynamoArgs
1213

1314

1415
async def register_llm_with_runtime_config(
1516
engine: sgl.Engine,
1617
endpoint: Endpoint,
1718
server_args: ServerArgs,
18-
migration_limit: int,
19-
):
20-
"""Register LLM with runtime config"""
21-
runtime_config = await _get_runtime_config(engine)
19+
dynamo_args: DynamoArgs,
20+
) -> bool:
21+
"""Register LLM with runtime config
22+
23+
Returns:
24+
bool: True if registration succeeded, False if it failed
25+
"""
26+
runtime_config = await _get_runtime_config(engine, dynamo_args)
2227
try:
2328
await register_llm(
2429
ModelType.Backend,
2530
endpoint,
2631
server_args.model_path,
2732
server_args.served_model_name,
2833
kv_cache_block_size=server_args.page_size,
29-
migration_limit=migration_limit,
34+
migration_limit=dynamo_args.migration_limit,
3035
runtime_config=runtime_config,
3136
)
37+
logging.info("Successfully registered LLM with runtime config")
38+
return True
3239
except Exception as e:
3340
logging.error(f"Failed to register with runtime config: {e}")
34-
return None
41+
return False
3542

3643

37-
async def _get_runtime_config(engine: sgl.Engine) -> Optional[ModelRuntimeConfig]:
44+
async def _get_runtime_config(
45+
engine: sgl.Engine, dynamo_args: DynamoArgs
46+
) -> Optional[ModelRuntimeConfig]:
3847
"""Get runtime config from SGLang engine"""
48+
runtime_config = ModelRuntimeConfig()
49+
# set reasoning parser and tool call parser
50+
runtime_config.reasoning_parser = dynamo_args.reasoning_parser
51+
runtime_config.tool_call_parser = dynamo_args.tool_call_parser
3952
try:
4053
# Try to check if the engine has a scheduler attribute with the computed values
4154
if hasattr(engine, "scheduler_info") and engine.scheduler_info is not None:
42-
runtime_config = ModelRuntimeConfig()
43-
4455
# Get max_total_num_tokens from scheduler_info
4556
if "max_total_num_tokens" in engine.scheduler_info:
4657
max_total_tokens = engine.scheduler_info["max_total_num_tokens"]
@@ -67,8 +78,8 @@ async def _get_runtime_config(engine: sgl.Engine) -> Optional[ModelRuntimeConfig
6778
"The engine may compute these values internally after initialization. "
6879
"Proceeding without runtime config - SGLang will use its internal defaults."
6980
)
70-
return None
81+
return runtime_config
7182

7283
except Exception as e:
7384
logging.warning(f"Failed to get runtime config: {e}. Proceeding without it.")
74-
return None
85+
return runtime_config

components/backends/trtllm/src/dynamo/trtllm/main.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,17 @@ async def init(runtime: DistributedRuntime, config: Config):
228228
async with get_llm_engine(engine_args) as engine:
229229
endpoint = component.endpoint(config.endpoint)
230230

231+
# should ideally call get_engine_runtime_config
232+
# this is because we don't have a good way to
233+
# get total_kv_blocks from the engine yet without calling get_stats_async
234+
# This causes an issue because get_stats_async doesn't work when no requests are sent to the engine
235+
# So for now, we just set the parsers from the config
236+
# TODO: fix this once we have a better way to get total_kv_blocks
237+
runtime_config = ModelRuntimeConfig()
238+
239+
runtime_config.reasoning_parser = config.reasoning_parser
240+
runtime_config.tool_call_parser = config.tool_call_parser
241+
231242
if is_first_worker(config):
232243
# Register the model with runtime config
233244
await register_llm(
@@ -237,6 +248,7 @@ async def init(runtime: DistributedRuntime, config: Config):
237248
config.served_model_name,
238249
kv_cache_block_size=config.kv_block_size,
239250
migration_limit=config.migration_limit,
251+
runtime_config=runtime_config,
240252
)
241253
# publisher will be set later if publishing is enabled.
242254
handler_config = RequestHandlerConfig(

components/backends/trtllm/src/dynamo/trtllm/utils/trtllm_utils.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ def __init__(self) -> None:
4949
self.next_endpoint: str = ""
5050
self.modality: str = "text"
5151

52+
self.reasoning_parser: Optional[str] = None
53+
self.tool_call_parser: Optional[str] = None
54+
5255
def __str__(self) -> str:
5356
return (
5457
f"Config(namespace={self.namespace}, "
@@ -73,6 +76,8 @@ def __str__(self) -> str:
7376
f"disaggregation_strategy={self.disaggregation_strategy}, "
7477
f"next_endpoint={self.next_endpoint}, "
7578
f"modality={self.modality})"
79+
f"reasoning_parser={self.reasoning_parser})"
80+
f"tool_call_parser={self.tool_call_parser})"
7681
)
7782

7883

@@ -234,6 +239,21 @@ def cmd_line_args():
234239
default="",
235240
help=f"Endpoint(in 'dyn://namespace.component.endpoint' format) to send requests to when running in disaggregation mode. Default: {DEFAULT_NEXT_ENDPOINT} if first worker, empty if next worker",
236241
)
242+
243+
# To avoid name conflicts with different backends, adopted the prefix "dyn-" for Dynamo-specific args
244+
parser.add_argument(
245+
"--dyn-tool-call-parser",
246+
type=str,
247+
default=None,
248+
help="Tool call parser name for the model. Available options: 'hermes', 'nemotron_deci', 'llama3_json', 'mistral', 'phi4'.",
249+
)
250+
parser.add_argument(
251+
"--dyn-reasoning-parser",
252+
type=str,
253+
default=None,
254+
help="Reasoning parser name for the model. Available options: 'basic', 'deepseek_r1', 'gpt_oss'.",
255+
)
256+
237257
args = parser.parse_args()
238258

239259
config = Config()
@@ -294,4 +314,7 @@ def cmd_line_args():
294314
config.publish_events_and_metrics = args.publish_events_and_metrics
295315
config.modality = args.modality
296316

317+
config.reasoning_parser = args.dyn_reasoning_parser
318+
config.tool_call_parser = args.dyn_tool_call_parser
319+
297320
return config

deploy/cloud/operator/internal/dynamo/backend_trtllm.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ func getGPUsPerNode(resources *common.Resources) int32 {
188188
// getCommonTRTLLMEnvVars returns a map of common environment variables for TRTLLM deployments
189189
func getCommonTRTLLMEnvVars() map[string]bool {
190190
return map[string]bool{
191-
"CUDA_VISIBLE_DEVICES": true, "MODEL_PATH": true, "HF_TOKEN": true, "HUGGING_FACE_HUB_TOKEN": true,
191+
"CUDA_VISIBLE_DEVICES": true, "MODEL_PATH": true, "HF_TOKEN": true, "HUGGING_FACE_HUB_TOKEN": true, "HF_ENDPOINT": true,
192192
"TOKENIZERS_PARALLELISM": true, "NCCL_DEBUG": true, "NCCL_IB_DISABLE": true, "NCCL_P2P_DISABLE": true,
193193
"TENSORRT_LLM_CACHE_DIR": true, "HF_HOME": true, "TRANSFORMERS_CACHE": true, "HF_DATASETS_CACHE": true,
194194
"PATH": true, "LD_LIBRARY_PATH": true, "PYTHONPATH": true, "HOME": true, "USER": true,

0 commit comments

Comments
 (0)