Skip to content

Commit 6a1a801

Browse files
authored
feat: Support static workers, run without etcd. (#2281)
1 parent 0802ecd commit 6a1a801

File tree

16 files changed

+523
-189
lines changed

16 files changed

+523
-189
lines changed

components/frontend/src/dynamo/frontend/main.py

Lines changed: 78 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,19 @@
88
# - Auto-discovery: Watches etcd for engine/worker registration (via `register_llm`).
99
# - Pre-processor: Prompt templating and tokenization.
1010
# - Router, defaulting to round-robin (TODO: Add flags to enable KV routing).
11+
#
12+
# Pass `--interactive` or `-i` for text chat instead of HTTP server.
13+
#
14+
# For static mode (no etcd auto-discovery):
15+
# - python -m dynamo.frontend --model-name Qwen3-0.6B-Q8_0.gguf --model-path ~/llms/Qwen3-0.6B --static-endpoint dynamo.backend.generate
16+
# Worker example:
17+
# - cd lib/bindings/python/examples/hello_world
18+
# - python server_sglang_static.py
1119

1220
import argparse
1321
import asyncio
22+
import os
23+
import re
1424

1525
import uvloop
1626

@@ -26,6 +36,36 @@
2636
from dynamo.runtime import DistributedRuntime
2737

2838

39+
def validate_static_endpoint(value):
40+
"""Validate that static-endpoint is three words separated by dots."""
41+
if not re.match(
42+
r"^[a-zA-Z_][a-zA-Z0-9_]*\.[a-zA-Z_][a-zA-Z0-9_]*\.[a-zA-Z_][a-zA-Z0-9_]*$",
43+
value,
44+
):
45+
raise argparse.ArgumentTypeError(
46+
f"static-endpoint must be three words separated by dots, got: {value}"
47+
)
48+
return value
49+
50+
51+
def validate_model_name(value):
52+
"""Validate that model-name is a non-empty string."""
53+
if not value or not isinstance(value, str) or len(value.strip()) == 0:
54+
raise argparse.ArgumentTypeError(
55+
f"model-name must be a non-empty string, got: {value}"
56+
)
57+
return value.strip()
58+
59+
60+
def validate_model_path(value):
61+
"""Validate that model-path is a valid directory on disk."""
62+
if not os.path.isdir(value):
63+
raise argparse.ArgumentTypeError(
64+
f"model-path must be a valid directory on disk, got: {value}"
65+
)
66+
return value
67+
68+
2969
def parse_args():
3070
parser = argparse.ArgumentParser(
3171
description="Dynamo Frontend: HTTP+Pre-processor+Router",
@@ -72,13 +112,35 @@ def parse_args():
72112
help=" KV Router. Disable KV events.",
73113
)
74114
parser.set_defaults(use_kv_events=True)
115+
parser.add_argument(
116+
"--static-endpoint",
117+
type=validate_static_endpoint,
118+
help="Static endpoint in format: word.word.word (e.g., dynamo.backend.generate)",
119+
)
120+
parser.add_argument(
121+
"--model-name",
122+
type=validate_model_name,
123+
help="Model name as a string (e.g., 'Llama-3.2-1B-Instruct')",
124+
)
125+
parser.add_argument(
126+
"--model-path",
127+
type=validate_model_path,
128+
help="Path to model directory on disk (e.g., /tmp/model_cache/lama3.2_1B/)",
129+
)
75130

76-
return parser.parse_args()
131+
flags = parser.parse_args()
132+
133+
if flags.static_endpoint and (not flags.model_name or not flags.model_path):
134+
parser.error("--static-endpoint requires both --model-name and --model-path")
135+
136+
return flags
77137

78138

79139
async def async_main():
80-
runtime = DistributedRuntime(asyncio.get_running_loop(), False)
81140
flags = parse_args()
141+
is_static = bool(flags.static_endpoint) # true if the string has a value
142+
143+
runtime = DistributedRuntime(asyncio.get_running_loop(), is_static)
82144

83145
if flags.router_mode == "kv":
84146
router_mode = RouterMode.KV
@@ -100,8 +162,20 @@ async def async_main():
100162
"router_config": RouterConfig(router_mode, kv_router_config),
101163
}
102164

103-
# out=dyn
104-
e = EntrypointArgs(EngineType.Dynamic, **kwargs)
165+
if flags.static_endpoint:
166+
kwargs["endpoint_id"] = flags.static_endpoint
167+
if flags.model_name:
168+
kwargs["model_name"] = flags.model_name
169+
if flags.model_path:
170+
kwargs["model_path"] = flags.model_path
171+
172+
if is_static:
173+
# out=dyn://<static_endpoint>
174+
engine_type = EngineType.Static
175+
else:
176+
# out=auto, most common
177+
engine_type = EngineType.Dynamic
178+
e = EntrypointArgs(engine_type, **kwargs)
105179
engine = await make_engine(runtime, e)
106180

107181
try:

docs/guides/dynamo_run.md

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ You will need [etcd](https://etcd.io/) and [nats](https://nats.io) with jetstrea
9393
**Node 1:** OpenAI compliant HTTP server, optional pre-processing, worker discovery:
9494

9595
```
96-
dynamo-run in=http out=dyn
96+
dynamo-run in=http out=auto
9797
```
9898

9999
**Node 2:** Vllm engine. Receives and returns requests over the network:
@@ -141,7 +141,18 @@ Example 4: Multiple component in a pipeline.
141141

142142
In the P/D disaggregated setup you would have `deepseek-distill-llama8b.prefill.generate` (possibly multiple instances of this) and `deepseek-distill-llama8b.decode.generate`.
143143

144-
For output it is always only `out=dyn`. This tells Dynamo to auto-discover the instances, group them by model, and load balance appropriately (depending on `--router-mode` flag). The old syntax of `dyn://...` is still accepted for backwards compatibility.
144+
For output it is always only `out=auto`. This tells Dynamo to auto-discover the instances, group them by model, and load balance appropriately (depending on `--router-mode` flag). The exception is static workers, see that section.
145+
146+
### Static workers without etcd
147+
148+
Normally in the distributed system the frontend uses etcd to discover workers. The option exists to have a static endpoint without etcd.
149+
150+
```
151+
Node 1: dynamo-run in=http out=dyn://dynamo.backend.generate --model-name Qwen3-0.6B-Q8_0.gguf --model-path ~/llms/Qwen3-0.6B
152+
Node 2: dynamo-run in=dyn://dynamo.backend.generate out=llamacpp ~/llms/Qwen3-0.6B-Q8_0.gguf --static-worker --context-length 4096
153+
```
154+
155+
Note how `out=` points to a single endpoint, which must match the worker. The model's name and config (to do pre-processing) are usually discovered by the frontend via etcd. Now we must pass them in (`--model-name` and `--model-path`).
145156

146157
### KV-aware routing
147158

@@ -194,7 +205,7 @@ dynamo-run in=dyn://dynamo.endpoint.generate out=vllm /data/llms/Qwen/Qwen3-4B
194205
**Start the ingress node**
195206
196207
```
197-
dynamo-run in=http out=dyn --router-mode kv
208+
dynamo-run in=http out=auto --router-mode kv
198209
```
199210
200211
The only difference from the distributed system above is `--router-mode kv`. The patched vllm announces when a KV block is created or removed. The Dynamo router run finds the worker with the best match for those KV blocks and directs the traffic to that node.
@@ -569,7 +580,7 @@ And below are arguments that are mocker-specific:
569580
```bash
570581
echo '{"speedup_ratio": 10.0}' > mocker_args.json
571582
dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json
572-
dynamo-run in=http out=dyn --router-mode kv
583+
dynamo-run in=http out=auto --router-mode kv
573584
```
574585

575586
### Extra engine arguments

launch/dynamo-run/src/flags.rs

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ use std::collections::HashMap;
1717
use std::path::PathBuf;
1818

1919
use clap::ValueEnum;
20+
use dynamo_llm::entrypoint::input::Input;
2021
use dynamo_llm::entrypoint::RouterConfig;
2122
use dynamo_llm::kv_router::KvRouterConfig;
2223
use dynamo_llm::local_model::LocalModel;
@@ -127,6 +128,12 @@ pub struct Flags {
127128
#[arg(long, value_parser = clap::value_parser!(u32).range(0..1024))]
128129
pub migration_limit: Option<u32>,
129130

131+
/// Make this a static worker.
132+
/// Do not connect to or advertise self on etcd.
133+
/// in=dyn://x.y.z only
134+
#[arg(long, default_value = "false")]
135+
pub static_worker: bool,
136+
130137
/// Everything after a `--`.
131138
/// These are the command line arguments to the python engine when using `pystr` or `pytok`.
132139
#[arg(index = 2, last = true, hide = true, allow_hyphen_values = true)]
@@ -136,9 +143,23 @@ pub struct Flags {
136143
impl Flags {
137144
/// For each Output variant, check if it would be able to run.
138145
/// This takes validation out of the main engine creation path.
139-
pub fn validate(&self, local_model: &LocalModel, out_opt: &Output) -> anyhow::Result<()> {
146+
pub fn validate(
147+
&self,
148+
local_model: &LocalModel,
149+
in_opt: &Input,
150+
out_opt: &Output,
151+
) -> anyhow::Result<()> {
152+
match in_opt {
153+
Input::Endpoint(_) => {}
154+
_ => {
155+
if self.static_worker {
156+
anyhow::bail!("'--static-worker true' only applies to in=dyn://x.y.z");
157+
}
158+
}
159+
}
160+
140161
match out_opt {
141-
Output::Dynamic => {
162+
Output::Auto => {
142163
if self.context_length.is_some() {
143164
anyhow::bail!("'--context-length' flag should only be used on the worker node, not on the ingress");
144165
}
@@ -149,6 +170,19 @@ impl Flags {
149170
anyhow::bail!("'--migration-limit' flag should only be used on the worker node, not on the ingress");
150171
}
151172
}
173+
Output::Static(_) => {
174+
if self.model_name.is_none()
175+
|| self
176+
.model_path_pos
177+
.as_ref()
178+
.or(self.model_path_flag.as_ref())
179+
.is_none()
180+
{
181+
anyhow::bail!(
182+
"out=dyn://<path> requires --model-name and --model-path, which are the name and path on disk of the model we expect to serve."
183+
);
184+
}
185+
}
152186
Output::EchoFull => {}
153187
Output::EchoCore => {
154188
if !local_model.card().has_tokenizer() {

launch/dynamo-run/src/lib.rs

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use anyhow::Context as _;
55
use dynamo_llm::entrypoint::input::Input;
66
use dynamo_llm::entrypoint::EngineConfig;
77
use dynamo_llm::local_model::{LocalModel, LocalModelBuilder};
8+
use dynamo_runtime::distributed::DistributedConfig;
89
use dynamo_runtime::CancellationToken;
910
use dynamo_runtime::{DistributedRuntime, Runtime};
1011

@@ -50,9 +51,13 @@ pub async fn run(
5051
if let Input::Endpoint(path) = &in_opt {
5152
builder.endpoint_id(Some(path.parse().with_context(|| path.clone())?));
5253

53-
let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
54+
let dst_config = DistributedConfig::from_settings(flags.static_worker);
55+
let distributed_runtime = DistributedRuntime::new(runtime.clone(), dst_config).await?;
5456
rt = Either::Right(distributed_runtime);
5557
};
58+
if let Some(Output::Static(path)) = &out_opt {
59+
builder.endpoint_id(Some(path.parse().with_context(|| path.clone())?));
60+
}
5661

5762
let local_model = builder.build().await?;
5863

@@ -64,7 +69,7 @@ pub async fn run(
6469
print_cuda(&out_opt);
6570

6671
// Now that we know the output we're targeting, check if we expect it to work
67-
flags.validate(&local_model, &out_opt)?;
72+
flags.validate(&local_model, &in_opt, &out_opt)?;
6873

6974
// Make an engine from the local_model, flags and output.
7075
let engine_config = engine_for(
@@ -94,24 +99,35 @@ async fn engine_for(
9499
rt: Either<Runtime, DistributedRuntime>,
95100
) -> anyhow::Result<EngineConfig> {
96101
match out_opt {
97-
Output::Dynamic => Ok(EngineConfig::Dynamic(Box::new(local_model))),
102+
Output::Auto => {
103+
// Auto-discover backends
104+
Ok(EngineConfig::Dynamic(Box::new(local_model)))
105+
}
106+
Output::Static(_) => {
107+
// A single static backend, no etcd
108+
Ok(EngineConfig::StaticRemote(Box::new(local_model)))
109+
}
98110
Output::EchoFull => Ok(EngineConfig::StaticFull {
99111
model: Box::new(local_model),
100112
engine: dynamo_llm::engines::make_engine_full(),
113+
is_static: flags.static_worker,
101114
}),
102115
Output::EchoCore => Ok(EngineConfig::StaticCore {
103116
engine: dynamo_llm::engines::make_engine_core(),
104117
model: Box::new(local_model),
118+
is_static: flags.static_worker,
105119
}),
106120
#[cfg(feature = "mistralrs")]
107121
Output::MistralRs => Ok(EngineConfig::StaticFull {
108122
engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
109123
model: Box::new(local_model),
124+
is_static: flags.static_worker,
110125
}),
111126
#[cfg(feature = "llamacpp")]
112127
Output::LlamaCpp => Ok(EngineConfig::StaticCore {
113128
engine: dynamo_engine_llamacpp::make_engine(cancel_token, &local_model).await?,
114129
model: Box::new(local_model),
130+
is_static: flags.static_worker,
115131
}),
116132
Output::Mocker => {
117133
let Either::Right(drt) = rt else {
@@ -127,6 +143,7 @@ async fn engine_for(
127143
Ok(EngineConfig::StaticCore {
128144
engine,
129145
model: Box::new(local_model),
146+
is_static: flags.static_worker,
130147
})
131148
}
132149
}

launch/dynamo-run/src/main.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ Example:
2424
- OR: ./dynamo-run /data/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
2525
"#;
2626

27-
const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--kv-cache-block-size=16] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--migration-limit=0] [--verbosity (-v|-vv)]";
27+
const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|auto|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--kv-cache-block-size=16] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--static-worker] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--migration-limit=0] [--verbosity (-v|-vv)]";
2828

2929
fn main() -> anyhow::Result<()> {
3030
// Set log level based on verbosity flag
@@ -134,5 +134,5 @@ fn is_in_dynamic(in_opt: &Input) -> bool {
134134
}
135135

136136
fn is_out_dynamic(out_opt: &Option<Output>) -> bool {
137-
matches!(out_opt, Some(Output::Dynamic))
137+
matches!(out_opt, Some(Output::Auto) | Some(Output::Static(_)))
138138
}

launch/dynamo-run/src/opt.rs

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,14 @@ pub enum Output {
1212
EchoCore,
1313

1414
/// Listen for models on nats/etcd, add/remove dynamically
15-
Dynamic,
15+
Auto,
16+
17+
/// Static remote: The dyn://namespace.component.endpoint name of a remote worker we expect to
18+
/// exists. THIS DISABLES AUTO-DISCOVERY. Only this endpoint will be connected.
19+
/// `--model-name and `--model-path` must also be set.
20+
///
21+
/// A static remote setup avoids having to run etcd.
22+
Static(String),
1623

1724
#[cfg(feature = "mistralrs")]
1825
/// Run inference on a model in a GGUF file using mistralrs w/ candle
@@ -40,15 +47,11 @@ impl TryFrom<&str> for Output {
4047
"echo_full" => Ok(Output::EchoFull),
4148
"echo_core" => Ok(Output::EchoCore),
4249

43-
"dyn" => Ok(Output::Dynamic),
50+
"dyn" | "auto" => Ok(Output::Auto),
4451

45-
// Deprecated, should only use `out=dyn`
4652
endpoint_path if endpoint_path.starts_with(ENDPOINT_SCHEME) => {
47-
tracing::warn!(
48-
"out=dyn://<path> is deprecated, the path is not used. Please use 'out=dyn'"
49-
);
50-
//let path = endpoint_path.strip_prefix(ENDPOINT_SCHEME).unwrap();
51-
Ok(Output::Dynamic)
53+
let path = endpoint_path.strip_prefix(ENDPOINT_SCHEME).unwrap();
54+
Ok(Output::Static(path.to_string()))
5255
}
5356

5457
e => Err(anyhow::anyhow!("Invalid out= option '{e}'")),
@@ -69,7 +72,8 @@ impl fmt::Display for Output {
6972
Output::EchoFull => "echo_full",
7073
Output::EchoCore => "echo_core",
7174

72-
Output::Dynamic => "dyn",
75+
Output::Auto => "auto",
76+
Output::Static(endpoint) => &format!("{ENDPOINT_SCHEME}{endpoint}"),
7377
};
7478
write!(f, "{s}")
7579
}

0 commit comments

Comments
 (0)