Skip to content

Commit 19a77ae

Browse files
authored
chore(dynamo-run): Remove out=sglang|vllm|trtllm (#1920)
1 parent 3c500ae commit 19a77ae

File tree

18 files changed

+36
-3428
lines changed

18 files changed

+36
-3428
lines changed

launch/dynamo-run/src/flags.rs

Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -64,46 +64,6 @@ pub struct Flags {
6464
#[arg(long)]
6565
pub model_config: Option<PathBuf>,
6666

67-
/// sglang, vllm
68-
///
69-
/// How many GPUs to use at once, total across all nodes.
70-
/// This must be divisible by num_nodes, and each node must use the same number of GPUs.
71-
#[arg(long, default_value = "1", value_parser = clap::value_parser!(u32).range(1..256))]
72-
pub tensor_parallel_size: u32,
73-
74-
/// sglang only
75-
/// vllm uses CUDA_VISIBLE_DEVICES env var
76-
///
77-
/// Use GPUs from this ID upwards.
78-
/// If your machine has four GPUs but the first two (0 and 1) are in use,
79-
/// pass --base-gpu-id 2 to use the third GPU (and up, if tensor_parallel_size > 1)
80-
#[arg(long, default_value = "0", value_parser = clap::value_parser!(u32).range(0..256))]
81-
pub base_gpu_id: u32,
82-
83-
/// vllm and sglang only
84-
///
85-
/// How many nodes/hosts to use
86-
#[arg(long, default_value = "1", value_parser = clap::value_parser!(u32).range(1..256))]
87-
pub num_nodes: u32,
88-
89-
/// vllm and sglang only
90-
///
91-
/// This node's unique ID, running from 0 to num_nodes - 1.
92-
#[arg(long, default_value = "0", value_parser = clap::value_parser!(u32).range(0..255))]
93-
pub node_rank: u32,
94-
95-
/// For multi-node / pipeline parallel this is the <host>:<port> of the first node.
96-
///
97-
/// - vllm: The address/port of the Ray head node.
98-
///
99-
/// - sglang: The Torch Distributed init method address, in format <host>:<port>.
100-
/// It becomes "tcp://<host>:<port>" when given to torch.distributed.init_process_group.
101-
/// This expects to use the nccl backend (transparently to us here).
102-
/// All nodes must use the same address here, which is node_rank == 0's address.
103-
///
104-
#[arg(long)]
105-
pub leader_addr: Option<String>,
106-
10767
/// If using `out=dyn` with multiple instances, this says how to route the requests.
10868
///
10969
/// Mostly interesting for KV-aware routing.
@@ -199,22 +159,6 @@ impl Flags {
199159
}
200160
#[cfg(feature = "mistralrs")]
201161
Output::MistralRs => {}
202-
Output::SgLang => {
203-
if !local_model.path().is_dir() {
204-
// TODO GGUF support for sglang: https://github.com/ai-dynamo/dynamo/issues/572
205-
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
206-
}
207-
}
208-
Output::Vllm => {
209-
if self.base_gpu_id != 0 {
210-
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
211-
}
212-
}
213-
Output::Trtllm => {
214-
if self.base_gpu_id != 0 {
215-
anyhow::bail!("TRTLLM does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
216-
}
217-
}
218162
#[cfg(feature = "llamacpp")]
219163
Output::LlamaCpp => {
220164
if !local_model.path().is_file() {

launch/dynamo-run/src/lib.rs

Lines changed: 24 additions & 145 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
// SPDX-License-Identifier: Apache-2.0
33

4-
use std::time::Duration;
5-
use std::{future::Future, pin::Pin};
6-
74
use anyhow::Context as _;
85
use dynamo_llm::entrypoint::input::Input;
96
use dynamo_llm::entrypoint::EngineConfig;
@@ -17,9 +14,6 @@ pub use flags::Flags;
1714
mod opt;
1815
pub use dynamo_llm::request_template::RequestTemplate;
1916
pub use opt::Output;
20-
mod subprocess;
21-
22-
const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);
2317

2418
pub async fn run(
2519
runtime: Runtime,
@@ -48,6 +42,7 @@ pub async fn run(
4842
.request_template(flags.request_template.clone())
4943
.migration_limit(flags.migration_limit);
5044

45+
// TODO: the comment below predates the removal of the trtllm/sglang/vllm subprocess engines; address this later:
5146
// If `in=dyn` we want the trtllm/sglang/vllm subprocess to listen on that endpoint.
5247
// If not, then the endpoint isn't exposed so we let LocalModel invent one.
5348
let mut rt = Either::Left(runtime.clone());
@@ -71,7 +66,7 @@ pub async fn run(
7166
flags.validate(&local_model, &out_opt)?;
7267

7368
// Make an engine from the local_model, flags and output.
74-
let (engine_config, extra) = engine_for(
69+
let engine_config = engine_for(
7570
runtime.primary_token(),
7671
out_opt,
7772
flags.clone(),
@@ -85,17 +80,9 @@ pub async fn run(
8580
//
8681
dynamo_llm::entrypoint::input::run_input(rt, in_opt, engine_config).await?;
8782

88-
// Allow engines to ask main thread to wait on an extra future.
89-
// We use this to stop the vllm and sglang sub-process
90-
if let Some(extra) = extra {
91-
extra.await;
92-
}
93-
9483
Ok(())
9584
}
9685

97-
type ExtraFuture = Pin<Box<dyn Future<Output = ()> + Send>>;
98-
9986
/// Create the engine matching `out_opt`
10087
/// Note validation happens in Flags::validate. In here assume everything is going to work.
10188
async fn engine_for(
@@ -104,71 +91,27 @@ async fn engine_for(
10491
flags: Flags,
10592
local_model: LocalModel,
10693
rt: Either<Runtime, DistributedRuntime>,
107-
) -> anyhow::Result<(EngineConfig, Option<ExtraFuture>)> {
94+
) -> anyhow::Result<EngineConfig> {
10895
match out_opt {
109-
Output::Dynamic => Ok((EngineConfig::Dynamic(Box::new(local_model)), None)),
110-
Output::EchoFull => Ok((
111-
EngineConfig::StaticFull {
112-
model: Box::new(local_model),
113-
engine: dynamo_llm::engines::make_engine_full(),
114-
},
115-
None,
116-
)),
117-
Output::EchoCore => Ok((
118-
EngineConfig::StaticCore {
119-
engine: dynamo_llm::engines::make_engine_core(),
120-
model: Box::new(local_model),
121-
},
122-
None,
123-
)),
96+
Output::Dynamic => Ok(EngineConfig::Dynamic(Box::new(local_model))),
97+
Output::EchoFull => Ok(EngineConfig::StaticFull {
98+
model: Box::new(local_model),
99+
engine: dynamo_llm::engines::make_engine_full(),
100+
}),
101+
Output::EchoCore => Ok(EngineConfig::StaticCore {
102+
engine: dynamo_llm::engines::make_engine_core(),
103+
model: Box::new(local_model),
104+
}),
124105
#[cfg(feature = "mistralrs")]
125-
Output::MistralRs => Ok((
126-
EngineConfig::StaticFull {
127-
engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
128-
model: Box::new(local_model),
129-
},
130-
None,
131-
)),
106+
Output::MistralRs => Ok(EngineConfig::StaticFull {
107+
engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
108+
model: Box::new(local_model),
109+
}),
132110
#[cfg(feature = "llamacpp")]
133-
Output::LlamaCpp => Ok((
134-
EngineConfig::StaticCore {
135-
engine: dynamo_engine_llamacpp::make_engine(cancel_token, &local_model).await?,
136-
model: Box::new(local_model),
137-
},
138-
None,
139-
)),
140-
// For multi-node config. vllm uses `ray`, see guide
141-
Output::Vllm => shell(subprocess::vllm::PY, cancel_token, local_model, flags, None).await,
142-
// For multi-node config. trtlllm uses `mpi`, see guide
143-
Output::Trtllm => {
144-
shell(
145-
subprocess::trtllm::PY,
146-
cancel_token,
147-
local_model,
148-
flags,
149-
None,
150-
)
151-
.await
152-
}
153-
Output::SgLang => {
154-
let multi_node_config = if flags.num_nodes > 1 {
155-
Some(dynamo_llm::engines::MultiNodeConfig {
156-
num_nodes: flags.num_nodes,
157-
node_rank: flags.node_rank,
158-
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
159-
})
160-
} else {
161-
None
162-
};
163-
shell(
164-
subprocess::sglang::PY,
165-
cancel_token,
166-
local_model,
167-
flags,
168-
multi_node_config,
169-
)
170-
.await
171-
}
111+
Output::LlamaCpp => Ok(EngineConfig::StaticCore {
112+
engine: dynamo_engine_llamacpp::make_engine(cancel_token, &local_model).await?,
113+
model: Box::new(local_model),
114+
}),
172115
Output::Mocker => {
173116
let Either::Right(drt) = rt else {
174117
panic!("Mocker requires a distributed runtime to run.");
@@ -180,76 +123,12 @@ async fn engine_for(
180123
let engine =
181124
dynamo_llm::mocker::engine::make_mocker_engine(drt, endpoint, args).await?;
182125

183-
Ok((
184-
EngineConfig::StaticCore {
185-
engine,
186-
model: Box::new(local_model),
187-
},
188-
None,
189-
))
190-
}
191-
}
192-
}
193-
194-
async fn shell(
195-
py_script: &'static str,
196-
cancel_token: CancellationToken,
197-
local_model: LocalModel,
198-
flags: Flags,
199-
multi_node_config: Option<dynamo_llm::engines::MultiNodeConfig>,
200-
) -> anyhow::Result<(EngineConfig, Option<ExtraFuture>)> {
201-
let (py_script, child) =
202-
match subprocess::start(py_script, &local_model, flags.clone(), multi_node_config).await {
203-
Ok(x) => x,
204-
Err(err) => {
205-
anyhow::bail!("Failed starting engine sub-process: {err}");
206-
}
207-
};
208-
209-
// Sub-process cleanup
210-
let extra: ExtraFuture = Box::pin(async move {
211-
stopper(cancel_token, child, py_script).await;
212-
});
213-
Ok((EngineConfig::Dynamic(Box::new(local_model)), Some(extra)))
214-
}
215-
216-
/// Wait for cancel_token to be cancelled, then stop the child as gracefully as possible.
217-
/// Keeps the TempPath alive until the child is stopped.
218-
async fn stopper(
219-
cancel_token: CancellationToken,
220-
mut child: tokio::process::Child,
221-
py_script: tempfile::TempPath,
222-
) {
223-
cancel_token.cancelled().await;
224-
225-
// Ask subprocess to stop gracefully
226-
if let Some(pid) = child.id() {
227-
unsafe { libc::kill(pid as i32, libc::SIGTERM) };
228-
}
229-
230-
tokio::select! {
231-
exit = child.wait() => {
232-
tracing::trace!("engine sub-process graceful exit");
233-
match exit {
234-
Ok(exit_status) if exit_status.success() => {}
235-
Ok(exit_status) => {
236-
// This is nearly always 15 (SIGTERM)
237-
tracing::trace!("engine sub-process non-0 exit: {exit_status}");
238-
}
239-
Err(err) => {
240-
tracing::warn!("engine sub-process error getting exit status: {err}");
241-
}
242-
}
243-
}
244-
_ = tokio::time::sleep(CHILD_STOP_TIMEOUT) => {
245-
// It didn't stop in time, kill it
246-
child.kill().await.expect("Failed killing engine subprocess");
247-
let _ = child.wait().await;
126+
Ok(EngineConfig::StaticCore {
127+
engine,
128+
model: Box::new(local_model),
129+
})
248130
}
249131
}
250-
// This temporary file contains the python script running the engine. It deletes on drop.
251-
// Keep it alive until the engine has stopped.
252-
drop(py_script);
253132
}
254133

255134
/// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it.

launch/dynamo-run/src/main.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
9090
in_opt = Some(val.try_into()?);
9191
}
9292
"out" => {
93+
if val == "sglang" || val == "trtllm" || val == "vllm" {
94+
tracing::error!("To run the {val} engine please use the Python interface, see root README or look in directory `components/backends/`.");
95+
std::process::exit(1);
96+
}
97+
9398
out_opt = Some(val.try_into()?);
9499
}
95100
_ => {

launch/dynamo-run/src/opt.rs

Lines changed: 5 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,6 @@ pub enum Output {
2222
/// Run inference using llama.cpp
2323
LlamaCpp,
2424

25-
/// Run inference using sglang
26-
SgLang,
27-
28-
/// Run inference using trtllm
29-
Trtllm,
30-
31-
// Start vllm in a sub-process connecting via nats
32-
// Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>`
33-
Vllm,
34-
3525
Mocker,
3626
}
3727

@@ -46,11 +36,7 @@ impl TryFrom<&str> for Output {
4636
#[cfg(feature = "llamacpp")]
4737
"llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp),
4838

49-
"sglang" => Ok(Output::SgLang),
50-
"trtllm" => Ok(Output::Trtllm),
51-
"vllm" => Ok(Output::Vllm),
5239
"mocker" => Ok(Output::Mocker),
53-
5440
"echo_full" => Ok(Output::EchoFull),
5541
"echo_core" => Ok(Output::EchoCore),
5642

@@ -79,11 +65,7 @@ impl fmt::Display for Output {
7965
#[cfg(feature = "llamacpp")]
8066
Output::LlamaCpp => "llamacpp",
8167

82-
Output::SgLang => "sglang",
83-
Output::Trtllm => "trtllm",
84-
Output::Vllm => "vllm",
8568
Output::Mocker => "mocker",
86-
8769
Output::EchoFull => "echo_full",
8870
Output::EchoCore => "echo_core",
8971

@@ -96,7 +78,11 @@ impl fmt::Display for Output {
9678
impl Output {
9779
#[allow(unused_mut)]
9880
pub fn available_engines() -> Vec<String> {
99-
let mut out = vec!["echo_core".to_string(), "echo_full".to_string()];
81+
let mut out = vec![
82+
"echo_core".to_string(),
83+
"echo_full".to_string(),
84+
Output::Mocker.to_string(),
85+
];
10086
#[cfg(feature = "mistralrs")]
10187
{
10288
out.push(Output::MistralRs.to_string());
@@ -107,11 +93,6 @@ impl Output {
10793
out.push(Output::LlamaCpp.to_string());
10894
}
10995

110-
out.push(Output::SgLang.to_string());
111-
out.push(Output::Trtllm.to_string());
112-
out.push(Output::Vllm.to_string());
113-
out.push(Output::Mocker.to_string());
114-
11596
out
11697
}
11798
}

0 commit comments

Comments
 (0)