Skip to content

Commit fdcf611

Browse files
authored
chore: Add Request Migration docs and minor enhancements (#2038)
1 parent bbe8dbb commit fdcf611

File tree

13 files changed

+164
-23
lines changed

13 files changed

+164
-23
lines changed

components/backends/llama_cpp/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,18 @@ Usage:
44
- `pip install -r requirements.txt` # Need a recent pip, `uv pip` might be too old.
55
- `python -m dynamo.llama_cpp --model-path /data/models/Qwen3-0.6B-Q8_0.gguf [args]`
66

7+
## Request Migration
8+
9+
In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the Frontend and the Backend.
10+
11+
The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues.
12+
13+
For ongoing requests, there is a `--migration-limit` flag which can be set on the Backend that tells the Frontend how many times a request can be migrated to another Backend should there be a loss of connectivity to the current Backend.
14+
15+
For example,
16+
```bash
17+
python3 -m dynamo.llama_cpp ... --migration-limit=3
18+
```
19+
indicates that a request to this model may be migrated to another Backend up to 3 times before the request fails, should the Frontend detect a connectivity issue with the current Backend.
20+
21+
The migrated request will continue responding to the original request, allowing for a seamless transition between Backends, and a reduced overall request failure rate at the Frontend for enhanced user experience.

components/backends/llama_cpp/src/dynamo/llama_cpp/main.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class Config:
2929
model_path: str
3030
model_name: Optional[str]
3131
context_length: int
32+
migration_limit: int
3233

3334

3435
@dynamo_worker(static=False)
@@ -40,7 +41,13 @@ async def worker(runtime: DistributedRuntime):
4041

4142
model_type = ModelType.Chat # llama.cpp does the pre-processing
4243
endpoint = component.endpoint(config.endpoint)
43-
await register_llm(model_type, endpoint, config.model_path, config.model_name)
44+
await register_llm(
45+
model_type,
46+
endpoint,
47+
config.model_path,
48+
config.model_name,
49+
migration_limit=config.migration_limit,
50+
)
4451

4552
# Initialize the engine
4653
# For more parameters see:
@@ -100,6 +107,12 @@ def cmd_line_args():
100107
default=None,
101108
help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
102109
)
110+
parser.add_argument(
111+
"--migration-limit",
112+
type=int,
113+
default=0,
114+
help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
115+
)
103116
args = parser.parse_args()
104117

105118
config = Config()
@@ -124,6 +137,7 @@ def cmd_line_args():
124137
config.component = parsed_component_name
125138
config.endpoint = parsed_endpoint_name
126139
config.context_length = args.context_length
140+
config.migration_limit = args.migration_limit
127141
return config
128142

129143

components/backends/sglang/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,22 @@ cd $DYNAMO_ROOT/components/backends/sglang
139139
./launch/disagg_dp_attn.sh
140140
```
141141

142+
## Request Migration
143+
144+
In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the Frontend and the Backend.
145+
146+
The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues.
147+
148+
For ongoing requests, there is a `--migration-limit` flag which can be set on the Backend that tells the Frontend how many times a request can be migrated to another Backend should there be a loss of connectivity to the current Backend.
149+
150+
For example,
151+
```bash
152+
python3 -m dynamo.sglang ... --migration-limit 3
153+
```
154+
indicates that a request to this model may be migrated to another Backend up to 3 times before the request fails, should the Frontend detect a connectivity issue with the current Backend.
155+
156+
The migrated request will continue responding to the original request, allowing for a seamless transition between Backends, and a reduced overall request failure rate at the Frontend for enhanced user experience.
157+
142158
## Advanced Examples
143159

144160
Below we provide a selected list of advanced examples. Please open up an issue if you'd like to see a specific example!

components/backends/sglang/src/dynamo/sglang/worker/main.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -311,11 +311,23 @@ def signal_handler():
311311

312312
logging.info("Signal handlers set up for graceful shutdown")
313313

314-
server_args = parse_sglang_args_inc(sys.argv[1:])
315-
await init(runtime, server_args)
316-
317-
318-
async def init(runtime: DistributedRuntime, server_args: ServerArgs):
314+
# TODO: Better handle non-sglang args
315+
sys_argv = sys.argv[1:]
316+
migration_limit = 0
317+
try:
318+
idx = sys_argv.index("--migration-limit")
319+
migration_limit = int(sys_argv[idx + 1])
320+
del sys_argv[idx : idx + 2] # Remove the args from sys_argv
321+
except Exception:
322+
pass
323+
324+
server_args = parse_sglang_args_inc(sys_argv)
325+
await init(runtime, server_args, migration_limit)
326+
327+
328+
async def init(
329+
runtime: DistributedRuntime, server_args: ServerArgs, migration_limit: int
330+
):
319331
"""Initialize worker (either prefill or aggregated)"""
320332

321333
engine = sgl.Engine(server_args=server_args)
@@ -330,6 +342,7 @@ async def init(runtime: DistributedRuntime, server_args: ServerArgs):
330342
server_args.model_path,
331343
server_args.served_model_name,
332344
kv_cache_block_size=server_args.page_size,
345+
migration_limit=migration_limit,
333346
)
334347

335348
if server_args.disaggregation_mode != "null":

components/backends/trtllm/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,22 @@ DISAGGREGATION_STRATEGY="prefill_first" ./launch/disagg.sh
205205

206206
Dynamo with TensorRT-LLM supports two methods for transferring KV cache in disaggregated serving: UCX (default) and NIXL (experimental). For detailed information and configuration instructions for each method, see the [KV cache transfer guide](./kv-cache-tranfer.md).
207207

208+
## Request Migration
209+
210+
In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the Frontend and the Backend.
211+
212+
The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues.
213+
214+
For ongoing requests, there is a `--migration-limit` flag which can be set on the Backend that tells the Frontend how many times a request can be migrated to another Backend should there be a loss of connectivity to the current Backend.
215+
216+
For example,
217+
```bash
218+
python3 -m dynamo.trtllm ... --migration-limit=3
219+
```
220+
indicates that a request to this model may be migrated to another Backend up to 3 times before the request fails, should the Frontend detect a connectivity issue with the current Backend.
221+
222+
The migrated request will continue responding to the original request, allowing for a seamless transition between Backends, and a reduced overall request failure rate at the Frontend for enhanced user experience.
223+
208224
## More Example Architectures
209225

210226
- [Llama 4 Maverick Instruct + Eagle Speculative Decoding](./llama4_plus_eagle.md)

components/backends/trtllm/src/dynamo/trtllm/main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ async def init(runtime: DistributedRuntime, config: Config):
137137
config.model_path,
138138
config.served_model_name,
139139
kv_cache_block_size=config.kv_block_size,
140+
migration_limit=config.migration_limit,
140141
)
141142

142143
# publisher will be set later if publishing is enabled.

components/backends/trtllm/src/dynamo/trtllm/utils/trtllm_utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def __init__(self) -> None:
2828
self.served_model_name: Optional[str] = None
2929
self.tensor_parallel_size: int = 1
3030
self.kv_block_size: int = 32
31+
self.migration_limit: int = 0
3132
self.extra_engine_args: str = ""
3233
self.publish_events_and_metrics: bool = False
3334
self.disaggregation_mode: DisaggregationMode = DEFAULT_DISAGGREGATION_MODE
@@ -46,6 +47,7 @@ def __str__(self) -> str:
4647
f"tensor_parallel_size={self.tensor_parallel_size}, "
4748
f"kv_block_size={self.kv_block_size}, "
4849
f"extra_engine_args={self.extra_engine_args}, "
50+
f"migration_limit={self.migration_limit}, "
4951
f"publish_events_and_metrics={self.publish_events_and_metrics}, "
5052
f"disaggregation_mode={self.disaggregation_mode}, "
5153
f"disaggregation_strategy={self.disaggregation_strategy}, "
@@ -113,6 +115,12 @@ def cmd_line_args():
113115
parser.add_argument(
114116
"--kv-block-size", type=int, default=32, help="Size of a KV cache block."
115117
)
118+
parser.add_argument(
119+
"--migration-limit",
120+
type=int,
121+
default=0,
122+
help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
123+
)
116124

117125
parser.add_argument(
118126
"--extra-engine-args",
@@ -188,6 +196,7 @@ def cmd_line_args():
188196

189197
config.tensor_parallel_size = args.tensor_parallel_size
190198
config.kv_block_size = args.kv_block_size
199+
config.migration_limit = args.migration_limit
191200
config.extra_engine_args = args.extra_engine_args
192201
config.publish_events_and_metrics = args.publish_events_and_metrics
193202

components/backends/vllm/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,3 +186,19 @@ vLLM workers are configured through command-line arguments. Key parameters inclu
186186
See `args.py` for the full list of configuration options and their defaults.
187187

188188
The [documentation](https://docs.vllm.ai/en/v0.9.2/configuration/serve_args.html?h=serve+arg) for the vLLM CLI args points to running 'vllm serve --help' to see what CLI args can be added. We use the same argument parser as vLLM.
189+
190+
## Request Migration
191+
192+
In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the Frontend and the Backend.
193+
194+
The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues.
195+
196+
For ongoing requests, there is a `--migration-limit` flag which can be set on the Backend that tells the Frontend how many times a request can be migrated to another Backend should there be a loss of connectivity to the current Backend.
197+
198+
For example,
199+
```bash
200+
python3 -m dynamo.vllm ... --migration-limit=3
201+
```
202+
indicates that a request to this model may be migrated to another Backend up to 3 times before the request fails, should the Frontend detect a connectivity issue with the current Backend.
203+
204+
The migrated request will continue responding to the original request, allowing for a seamless transition between Backends, and a reduced overall request failure rate at the Frontend for enhanced user experience.

components/backends/vllm/src/dynamo/vllm/args.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class Config:
3131
component: str
3232
endpoint: str
3333
is_prefill_worker: bool
34+
migration_limit: int = 0
3435
kv_port: Optional[int] = None
3536
side_channel_port: Optional[int] = None
3637

@@ -57,6 +58,12 @@ def parse_args() -> Config:
5758
action="store_true",
5859
help="Enable prefill functionality for this worker. Uses the provided namespace to construct dyn://namespace.prefill.generate",
5960
)
61+
parser.add_argument(
62+
"--migration-limit",
63+
type=int,
64+
default=0,
65+
help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
66+
)
6067

6168
parser = AsyncEngineArgs.add_cli_args(parser)
6269
args = parser.parse_args()
@@ -102,6 +109,7 @@ def parse_args() -> Config:
102109
config.endpoint = parsed_endpoint_name
103110
config.engine_args = engine_args
104111
config.is_prefill_worker = args.is_prefill_worker
112+
config.migration_limit = args.migration_limit
105113

106114
if config.engine_args.block_size is None:
107115
config.engine_args.block_size = 16

components/backends/vllm/src/dynamo/vllm/main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ async def init(runtime: DistributedRuntime, config: Config):
148148
config.model,
149149
config.served_model_name,
150150
kv_cache_block_size=config.engine_args.block_size,
151+
migration_limit=config.migration_limit,
151152
)
152153

153154
factory = StatLoggerFactory(component, config.engine_args.data_parallel_rank or 0)

0 commit comments

Comments
 (0)