Skip to content

Commit c5eeae2

Browse files
authored
Merge branch 'main' into ayushag/add-pythonic-parser
2 parents c89a7dd + 8064849 commit c5eeae2

File tree

25 files changed

+464
-275
lines changed

25 files changed

+464
-275
lines changed

components/backends/sglang/deploy/disagg-multinode.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ spec:
1818
services:
1919
Frontend:
2020
dynamoNamespace: sglang-disagg-multinode
21-
componentType: main
21+
componentType: frontend
2222
replicas: 1
2323
extraPodSpec:
2424
mainContainer:
@@ -54,7 +54,7 @@ spec:
5454
multinode:
5555
nodeCount: 2
5656
envFromSecret: hf-token-secret
57-
dynamoNamespace: sglang-disagg
57+
dynamoNamespace: sglang-disagg-multinode
5858
componentType: worker
5959
replicas: 1
6060
resources:

components/backends/sglang/docs/multinode-examples.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@ SPDX-License-Identifier: Apache-2.0
99

1010
SGLang allows you to deploy multi-node sized models by adding in the `dist-init-addr`, `nnodes`, and `node-rank` arguments. Below we demonstrate an example of deploying DeepSeek R1 for disaggregated serving across 4 nodes. This example requires 4 nodes of 8xH100 GPUs.
1111

12+
**Prerequisite**: Building the Dynamo container.
13+
14+
```bash
15+
cd $DYNAMO_ROOT
16+
docker build -f container/Dockerfile.sglang-wideep . -t dynamo-wideep --no-cache
17+
```
18+
19+
You can use a specific tag from the [lmsys dockerhub](https://hub.docker.com/r/lmsysorg/sglang/tags) by adding `--build-arg SGLANG_IMAGE_TAG=<tag>` to the build command.
20+
1221
**Step 1**: Use the provided helper script to generate commands to start NATS/ETCD on your head prefill node. This script will also give you environment variables to export on each other node. You will need the IP addresses of your head prefill and head decode node to run this script.
1322
```bash
1423
./utils/gen_env_vars.sh

components/backends/sglang/src/dynamo/sglang/args.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from argparse import Namespace
1111
from dataclasses import dataclass
1212
from enum import Enum
13-
from typing import Any, Dict
13+
from typing import Any, Dict, Optional
1414

1515
from sglang.srt.server_args import ServerArgs
1616

@@ -39,6 +39,10 @@ class DynamoArgs:
3939
endpoint: str
4040
migration_limit: int
4141

42+
# tool and reasoning parser options
43+
tool_call_parser: Optional[str] = None
44+
reasoning_parser: Optional[str] = None
45+
4246

4347
class DisaggregationMode(Enum):
4448
AGGREGATED = "agg"
@@ -71,6 +75,20 @@ def parse_args(args: list[str]) -> Config:
7175
"--version", action="version", version=f"Dynamo Backend SGLang {__version__}"
7276
)
7377

78+
# To avoid name conflicts with different backends, adopted the prefix "dyn-" for Dynamo-specific args
79+
parser.add_argument(
80+
"--dyn-tool-call-parser",
81+
type=str,
82+
default=None,
83+
help="Tool call parser name for the model. Available options: 'hermes', 'nemotron_deci', 'llama3_json', 'mistral', 'phi4'.",
84+
)
85+
parser.add_argument(
86+
"--dyn-reasoning-parser",
87+
type=str,
88+
default=None,
89+
help="Reasoning parser name for the model. Available options: 'basic', 'deepseek_r1', 'gpt_oss'.",
90+
)
91+
7492
# Dynamo args
7593
for info in DYNAMO_ARGS.values():
7694
parser.add_argument(
@@ -123,6 +141,8 @@ def parse_args(args: list[str]) -> Config:
123141
component=parsed_component_name,
124142
endpoint=parsed_endpoint_name,
125143
migration_limit=parsed_args.migration_limit,
144+
tool_call_parser=parsed_args.dyn_tool_call_parser,
145+
reasoning_parser=parsed_args.dyn_reasoning_parser,
126146
)
127147
logging.debug(f"Dynamo args: {dynamo_args}")
128148

components/backends/sglang/src/dynamo/sglang/main.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,18 +81,43 @@ async def init(runtime: DistributedRuntime, config: Config):
8181
logging.info(f"Setting up ZMQ kv event publisher at {zmq_ep}")
8282
kv_publisher = ZmqKvEventPublisher(component=component, config=zmq_config)
8383

84+
# Readiness gate: requests wait until model is registered
85+
ready_event = asyncio.Event()
86+
87+
async def gated_generate(request):
88+
"""Queue requests until model registration completes"""
89+
await ready_event.wait() # Block until model is ready
90+
async for response in handler.generate(request):
91+
yield response
92+
8493
handler = DecodeWorkerHandler(
8594
component, engine, config, publisher, kv_publisher, prefill_client
8695
)
8796

88-
await register_llm_with_runtime_config(
89-
engine, generate_endpoint, server_args, dynamo_args.migration_limit
90-
)
97+
async def register_model():
98+
"""Register the model and signal readiness"""
99+
registration_success = await register_llm_with_runtime_config(
100+
engine,
101+
generate_endpoint,
102+
server_args,
103+
dynamo_args,
104+
)
105+
106+
if not registration_success:
107+
logging.error("Model registration failed; shutting down")
108+
runtime.shutdown()
109+
raise RuntimeError("Model registration failed")
110+
111+
# Model is ready - allow queued requests to proceed
112+
ready_event.set()
113+
logging.info("Model registration succeeded; processing queued requests")
91114

92115
try:
93-
# TODO: add in native endpoints
116+
# Start endpoint immediately and register model concurrently
117+
# Requests queue until ready_event is set
94118
await asyncio.gather(
95-
generate_endpoint.serve_endpoint(handler.generate, graceful_shutdown=False),
119+
generate_endpoint.serve_endpoint(gated_generate, graceful_shutdown=False),
120+
register_model(),
96121
)
97122
except Exception as e:
98123
logging.error(f"Failed to serve endpoints: {e}")

components/backends/sglang/src/dynamo/sglang/register.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,38 +9,49 @@
99

1010
from dynamo._core import Endpoint
1111
from dynamo.llm import ModelRuntimeConfig, ModelType, register_llm
12+
from dynamo.sglang.args import DynamoArgs
1213

1314

1415
async def register_llm_with_runtime_config(
1516
engine: sgl.Engine,
1617
endpoint: Endpoint,
1718
server_args: ServerArgs,
18-
migration_limit: int,
19-
):
20-
"""Register LLM with runtime config"""
21-
runtime_config = await _get_runtime_config(engine)
19+
dynamo_args: DynamoArgs,
20+
) -> bool:
21+
"""Register LLM with runtime config
22+
23+
Returns:
24+
bool: True if registration succeeded, False if it failed
25+
"""
26+
runtime_config = await _get_runtime_config(engine, dynamo_args)
2227
try:
2328
await register_llm(
2429
ModelType.Backend,
2530
endpoint,
2631
server_args.model_path,
2732
server_args.served_model_name,
2833
kv_cache_block_size=server_args.page_size,
29-
migration_limit=migration_limit,
34+
migration_limit=dynamo_args.migration_limit,
3035
runtime_config=runtime_config,
3136
)
37+
logging.info("Successfully registered LLM with runtime config")
38+
return True
3239
except Exception as e:
3340
logging.error(f"Failed to register with runtime config: {e}")
34-
return None
41+
return False
3542

3643

37-
async def _get_runtime_config(engine: sgl.Engine) -> Optional[ModelRuntimeConfig]:
44+
async def _get_runtime_config(
45+
engine: sgl.Engine, dynamo_args: DynamoArgs
46+
) -> Optional[ModelRuntimeConfig]:
3847
"""Get runtime config from SGLang engine"""
48+
runtime_config = ModelRuntimeConfig()
49+
# set reasoning parser and tool call parser
50+
runtime_config.reasoning_parser = dynamo_args.reasoning_parser
51+
runtime_config.tool_call_parser = dynamo_args.tool_call_parser
3952
try:
4053
# Try to check if the engine has a scheduler attribute with the computed values
4154
if hasattr(engine, "scheduler_info") and engine.scheduler_info is not None:
42-
runtime_config = ModelRuntimeConfig()
43-
4455
# Get max_total_num_tokens from scheduler_info
4556
if "max_total_num_tokens" in engine.scheduler_info:
4657
max_total_tokens = engine.scheduler_info["max_total_num_tokens"]
@@ -67,8 +78,8 @@ async def _get_runtime_config(engine: sgl.Engine) -> Optional[ModelRuntimeConfig
6778
"The engine may compute these values internally after initialization. "
6879
"Proceeding without runtime config - SGLang will use its internal defaults."
6980
)
70-
return None
81+
return runtime_config
7182

7283
except Exception as e:
7384
logging.warning(f"Failed to get runtime config: {e}. Proceeding without it.")
74-
return None
85+
return runtime_config

components/backends/trtllm/src/dynamo/trtllm/main.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,17 @@ async def init(runtime: DistributedRuntime, config: Config):
228228
async with get_llm_engine(engine_args) as engine:
229229
endpoint = component.endpoint(config.endpoint)
230230

231+
# should ideally call get_engine_runtime_config
232+
# this is because we don't have a good way to
233+
# get total_kv_blocks from the engine yet without calling get_stats_async
234+
# This causes an issue because get_stats_async doesn't work when no requests are sent to the engine
235+
# So for now, we just set the parsers from the config
236+
# TODO: fix this once we have a better way to get total_kv_blocks
237+
runtime_config = ModelRuntimeConfig()
238+
239+
runtime_config.reasoning_parser = config.reasoning_parser
240+
runtime_config.tool_call_parser = config.tool_call_parser
241+
231242
if is_first_worker(config):
232243
# Register the model with runtime config
233244
await register_llm(
@@ -237,6 +248,7 @@ async def init(runtime: DistributedRuntime, config: Config):
237248
config.served_model_name,
238249
kv_cache_block_size=config.kv_block_size,
239250
migration_limit=config.migration_limit,
251+
runtime_config=runtime_config,
240252
)
241253
# publisher will be set later if publishing is enabled.
242254
handler_config = RequestHandlerConfig(

components/backends/trtllm/src/dynamo/trtllm/utils/trtllm_utils.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ def __init__(self) -> None:
4949
self.next_endpoint: str = ""
5050
self.modality: str = "text"
5151

52+
self.reasoning_parser: Optional[str] = None
53+
self.tool_call_parser: Optional[str] = None
54+
5255
def __str__(self) -> str:
5356
return (
5457
f"Config(namespace={self.namespace}, "
@@ -73,6 +76,8 @@ def __str__(self) -> str:
7376
f"disaggregation_strategy={self.disaggregation_strategy}, "
7477
f"next_endpoint={self.next_endpoint}, "
7578
f"modality={self.modality})"
79+
f"reasoning_parser={self.reasoning_parser})"
80+
f"tool_call_parser={self.tool_call_parser})"
7681
)
7782

7883

@@ -234,6 +239,21 @@ def cmd_line_args():
234239
default="",
235240
help=f"Endpoint(in 'dyn://namespace.component.endpoint' format) to send requests to when running in disaggregation mode. Default: {DEFAULT_NEXT_ENDPOINT} if first worker, empty if next worker",
236241
)
242+
243+
# To avoid name conflicts with different backends, adopted the prefix "dyn-" for Dynamo-specific args
244+
parser.add_argument(
245+
"--dyn-tool-call-parser",
246+
type=str,
247+
default=None,
248+
help="Tool call parser name for the model. Available options: 'hermes', 'nemotron_deci', 'llama3_json', 'mistral', 'phi4'.",
249+
)
250+
parser.add_argument(
251+
"--dyn-reasoning-parser",
252+
type=str,
253+
default=None,
254+
help="Reasoning parser name for the model. Available options: 'basic', 'deepseek_r1', 'gpt_oss'.",
255+
)
256+
237257
args = parser.parse_args()
238258

239259
config = Config()
@@ -294,4 +314,7 @@ def cmd_line_args():
294314
config.publish_events_and_metrics = args.publish_events_and_metrics
295315
config.modality = args.modality
296316

317+
config.reasoning_parser = args.dyn_reasoning_parser
318+
config.tool_call_parser = args.dyn_tool_call_parser
319+
297320
return config

deploy/cloud/operator/internal/dynamo/backend_trtllm.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ func getGPUsPerNode(resources *common.Resources) int32 {
188188
// getCommonTRTLLMEnvVars returns a map of common environment variables for TRTLLM deployments
189189
func getCommonTRTLLMEnvVars() map[string]bool {
190190
return map[string]bool{
191-
"CUDA_VISIBLE_DEVICES": true, "MODEL_PATH": true, "HF_TOKEN": true, "HUGGING_FACE_HUB_TOKEN": true,
191+
"CUDA_VISIBLE_DEVICES": true, "MODEL_PATH": true, "HF_TOKEN": true, "HUGGING_FACE_HUB_TOKEN": true, "HF_ENDPOINT": true,
192192
"TOKENIZERS_PARALLELISM": true, "NCCL_DEBUG": true, "NCCL_IB_DISABLE": true, "NCCL_P2P_DISABLE": true,
193193
"TENSORRT_LLM_CACHE_DIR": true, "HF_HOME": true, "TRANSFORMERS_CACHE": true, "HF_DATASETS_CACHE": true,
194194
"PATH": true, "LD_LIBRARY_PATH": true, "PYTHONPATH": true, "HOME": true, "USER": true,

0 commit comments

Comments
 (0)