ai-dynamo
diff --git a/launch/dynamo-run/src/flags.rs b/launch/dynamo-run/src/flags.rs
Lines changed: 0 additions & 56 deletions
diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs
Lines changed: 24 additions & 145 deletions
diff --git a/launch/dynamo-run/src/main.rs b/launch/dynamo-run/src/main.rs
Lines changed: 5 additions & 0 deletions
diff --git a/launch/dynamo-run/src/opt.rs b/launch/dynamo-run/src/opt.rs
Lines changed: 5 additions & 24 deletions
@@ -64,46 +64,6 @@ pub struct Flags {
     #[arg(long)]
     pub model_config: Option<PathBuf>,
 
-    /// sglang, vllm
-    ///
-    /// How many GPUs to use at once, total across all nodes.
-    /// This must divide by num_nodes, and each node must use the same number of GPUs.
-    #[arg(long, default_value = "1", value_parser = clap::value_parser!(u32).range(1..256))]
-    pub tensor_parallel_size: u32,
-
-    /// sglang only
-    /// vllm uses CUDA_VISIBLE_DEVICES env var
-    ///
-    /// Use GPUs from this ID upwards.
-    /// If your machine has four GPUs but the first two (0 and 1) are in use,
-    /// pass --base-gpu-id 2 to use the third GPU (and up, if tensor_parallel_size > 1)
-    #[arg(long, default_value = "0", value_parser = clap::value_parser!(u32).range(0..256))]
-    pub base_gpu_id: u32,
-
-    /// vllm and sglang only
-    ///
-    /// How many nodes/hosts to use
-    #[arg(long, default_value = "1", value_parser = clap::value_parser!(u32).range(1..256))]
-    pub num_nodes: u32,
-
-    /// vllm and sglang only
-    ///
-    /// This nodes' unique ID, running from 0 to num_nodes.
-    #[arg(long, default_value = "0", value_parser = clap::value_parser!(u32).range(0..255))]
-    pub node_rank: u32,
-
-    /// For multi-node / pipeline parallel this is the <host>:<port> of the first node.
-    ///
-    /// - vllm: The address/port of the Ray head node.
-    ///
-    /// - sglang: The Torch Distributed init method address, in format <host>:<port>.
-    ///   It becomes "tcp://<host>:<port>" when given to torch.distributed.init_process_group.
-    ///   This expects to use the nccl backend (transparently to us here).
-    ///   All nodes must use the same address here, which is node_rank == 0's address.
-    ///
-    #[arg(long)]
-    pub leader_addr: Option<String>,
-
     /// If using `out=dyn` with multiple instances, this says how to route the requests.
     ///
     /// Mostly interesting for KV-aware routing.
@@ -199,22 +159,6 @@ impl Flags {
             }
             #[cfg(feature = "mistralrs")]
             Output::MistralRs => {}
-            Output::SgLang => {
-                if !local_model.path().is_dir() {
-                    // TODO GGUF support for sglang: https://github.com/ai-dynamo/dynamo/issues/572
-                    anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
-                }
-            }
-            Output::Vllm => {
-                if self.base_gpu_id != 0 {
-                    anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
-                }
-            }
-            Output::Trtllm => {
-                if self.base_gpu_id != 0 {
-                    anyhow::bail!("TRTLLM does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
-                }
-            }
             #[cfg(feature = "llamacpp")]
             Output::LlamaCpp => {
                 if !local_model.path().is_file() {
 
@@ -1,9 +1,6 @@
 // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-use std::time::Duration;
-use std::{future::Future, pin::Pin};
-
 use anyhow::Context as _;
 use dynamo_llm::entrypoint::input::Input;
 use dynamo_llm::entrypoint::EngineConfig;
@@ -17,9 +14,6 @@ pub use flags::Flags;
 mod opt;
 pub use dynamo_llm::request_template::RequestTemplate;
 pub use opt::Output;
-mod subprocess;
-
-const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);
 
 pub async fn run(
     runtime: Runtime,
@@ -48,6 +42,7 @@ pub async fn run(
         .request_template(flags.request_template.clone())
         .migration_limit(flags.migration_limit);
 
+    // TODO: The comment below predates the removal of the engine subprocesses; revisit it later:
     // If `in=dyn` we want the trtllm/sglang/vllm subprocess to listen on that endpoint.
     // If not, then the endpoint isn't exposed so we let LocalModel invent one.
     let mut rt = Either::Left(runtime.clone());
@@ -71,7 +66,7 @@ pub async fn run(
     flags.validate(&local_model, &out_opt)?;
 
     // Make an engine from the local_model, flags and output.
-    let (engine_config, extra) = engine_for(
+    let engine_config = engine_for(
         runtime.primary_token(),
         out_opt,
         flags.clone(),
@@ -85,17 +80,9 @@ pub async fn run(
     //
     dynamo_llm::entrypoint::input::run_input(rt, in_opt, engine_config).await?;
 
-    // Allow engines to ask main thread to wait on an extra future.
-    // We use this to stop the vllm and sglang sub-process
-    if let Some(extra) = extra {
-        extra.await;
-    }
-
     Ok(())
 }
 
-type ExtraFuture = Pin<Box<dyn Future<Output = ()> + Send>>;
-
 /// Create the engine matching `out_opt`
 /// Note validation happens in Flags::validate. In here assume everything is going to work.
 async fn engine_for(
@@ -104,71 +91,27 @@ async fn engine_for(
     flags: Flags,
     local_model: LocalModel,
     rt: Either<Runtime, DistributedRuntime>,
-) -> anyhow::Result<(EngineConfig, Option<ExtraFuture>)> {
+) -> anyhow::Result<EngineConfig> {
     match out_opt {
-        Output::Dynamic => Ok((EngineConfig::Dynamic(Box::new(local_model)), None)),
-        Output::EchoFull => Ok((
-            EngineConfig::StaticFull {
-                model: Box::new(local_model),
-                engine: dynamo_llm::engines::make_engine_full(),
-            },
-            None,
-        )),
-        Output::EchoCore => Ok((
-            EngineConfig::StaticCore {
-                engine: dynamo_llm::engines::make_engine_core(),
-                model: Box::new(local_model),
-            },
-            None,
-        )),
+        Output::Dynamic => Ok(EngineConfig::Dynamic(Box::new(local_model))),
+        Output::EchoFull => Ok(EngineConfig::StaticFull {
+            model: Box::new(local_model),
+            engine: dynamo_llm::engines::make_engine_full(),
+        }),
+        Output::EchoCore => Ok(EngineConfig::StaticCore {
+            engine: dynamo_llm::engines::make_engine_core(),
+            model: Box::new(local_model),
+        }),
         #[cfg(feature = "mistralrs")]
-        Output::MistralRs => Ok((
-            EngineConfig::StaticFull {
-                engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
-                model: Box::new(local_model),
-            },
-            None,
-        )),
+        Output::MistralRs => Ok(EngineConfig::StaticFull {
+            engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
+            model: Box::new(local_model),
+        }),
         #[cfg(feature = "llamacpp")]
-        Output::LlamaCpp => Ok((
-            EngineConfig::StaticCore {
-                engine: dynamo_engine_llamacpp::make_engine(cancel_token, &local_model).await?,
-                model: Box::new(local_model),
-            },
-            None,
-        )),
-        // For multi-node config. vllm uses `ray`, see guide
-        Output::Vllm => shell(subprocess::vllm::PY, cancel_token, local_model, flags, None).await,
-        // For multi-node config. trtlllm uses `mpi`, see guide
-        Output::Trtllm => {
-            shell(
-                subprocess::trtllm::PY,
-                cancel_token,
-                local_model,
-                flags,
-                None,
-            )
-            .await
-        }
-        Output::SgLang => {
-            let multi_node_config = if flags.num_nodes > 1 {
-                Some(dynamo_llm::engines::MultiNodeConfig {
-                    num_nodes: flags.num_nodes,
-                    node_rank: flags.node_rank,
-                    leader_addr: flags.leader_addr.clone().unwrap_or_default(),
-                })
-            } else {
-                None
-            };
-            shell(
-                subprocess::sglang::PY,
-                cancel_token,
-                local_model,
-                flags,
-                multi_node_config,
-            )
-            .await
-        }
+        Output::LlamaCpp => Ok(EngineConfig::StaticCore {
+            engine: dynamo_engine_llamacpp::make_engine(cancel_token, &local_model).await?,
+            model: Box::new(local_model),
+        }),
         Output::Mocker => {
             let Either::Right(drt) = rt else {
                 panic!("Mocker requires a distributed runtime to run.");
@@ -180,76 +123,12 @@ async fn engine_for(
             let engine =
                 dynamo_llm::mocker::engine::make_mocker_engine(drt, endpoint, args).await?;
 
-            Ok((
-                EngineConfig::StaticCore {
-                    engine,
-                    model: Box::new(local_model),
-                },
-                None,
-            ))
-        }
-    }
-}
-
-async fn shell(
-    py_script: &'static str,
-    cancel_token: CancellationToken,
-    local_model: LocalModel,
-    flags: Flags,
-    multi_node_config: Option<dynamo_llm::engines::MultiNodeConfig>,
-) -> anyhow::Result<(EngineConfig, Option<ExtraFuture>)> {
-    let (py_script, child) =
-        match subprocess::start(py_script, &local_model, flags.clone(), multi_node_config).await {
-            Ok(x) => x,
-            Err(err) => {
-                anyhow::bail!("Failed starting engine sub-process: {err}");
-            }
-        };
-
-    // Sub-process cleanup
-    let extra: ExtraFuture = Box::pin(async move {
-        stopper(cancel_token, child, py_script).await;
-    });
-    Ok((EngineConfig::Dynamic(Box::new(local_model)), Some(extra)))
-}
-
-/// Wait for cancel_token to be cancelled, then stop the child as gracefully as possible.
-/// Keeps the TempPath alive until the child is stopped.
-async fn stopper(
-    cancel_token: CancellationToken,
-    mut child: tokio::process::Child,
-    py_script: tempfile::TempPath,
-) {
-    cancel_token.cancelled().await;
-
-    // Ask subprocess to stop gracefully
-    if let Some(pid) = child.id() {
-        unsafe { libc::kill(pid as i32, libc::SIGTERM) };
-    }
-
-    tokio::select! {
-        exit = child.wait() => {
-            tracing::trace!("engine sub-process graceful exit");
-            match exit {
-                Ok(exit_status) if exit_status.success() => {}
-                Ok(exit_status) => {
-                    // This is nearly always 15 (SIGTERM)
-                    tracing::trace!("engine sub-process non-0 exit: {exit_status}");
-                }
-                Err(err) => {
-                    tracing::warn!("engine sub-process error getting exit status: {err}");
-                }
-            }
-        }
-        _ = tokio::time::sleep(CHILD_STOP_TIMEOUT) => {
-            // It didn't stop in time, kill it
-            child.kill().await.expect("Failed killing engine subprocess");
-            let _ = child.wait().await;
+            Ok(EngineConfig::StaticCore {
+                engine,
+                model: Box::new(local_model),
+            })
         }
     }
-    // This temporary file contains the python script running the engine. It deletes on drop.
-    // Keep it alive until the engine has stopped.
-    drop(py_script);
 }
 
 /// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it.
 
@@ -90,6 +90,11 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
                 in_opt = Some(val.try_into()?);
             }
             "out" => {
+                if val == "sglang" || val == "trtllm" || val == "vllm" {
+                    tracing::error!("To run the {val} engine please use the Python interface, see root README or look in directory `components/backends/`.");
+                    std::process::exit(1);
+                }
+
                 out_opt = Some(val.try_into()?);
             }
             _ => {
 
@@ -22,16 +22,6 @@ pub enum Output {
     /// Run inference using llama.cpp
     LlamaCpp,
 
-    /// Run inference using sglang
-    SgLang,
-
-    /// Run inference using trtllm
-    Trtllm,
-
-    // Start vllm in a sub-process connecting via nats
-    // Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>`
-    Vllm,
-
     Mocker,
 }
 
@@ -46,11 +36,7 @@ impl TryFrom<&str> for Output {
             #[cfg(feature = "llamacpp")]
             "llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp),
 
-            "sglang" => Ok(Output::SgLang),
-            "trtllm" => Ok(Output::Trtllm),
-            "vllm" => Ok(Output::Vllm),
             "mocker" => Ok(Output::Mocker),
-
             "echo_full" => Ok(Output::EchoFull),
             "echo_core" => Ok(Output::EchoCore),
 
@@ -79,11 +65,7 @@ impl fmt::Display for Output {
             #[cfg(feature = "llamacpp")]
             Output::LlamaCpp => "llamacpp",
 
-            Output::SgLang => "sglang",
-            Output::Trtllm => "trtllm",
-            Output::Vllm => "vllm",
             Output::Mocker => "mocker",
-
             Output::EchoFull => "echo_full",
             Output::EchoCore => "echo_core",
 
@@ -96,7 +78,11 @@ impl fmt::Display for Output {
 impl Output {
     #[allow(unused_mut)]
     pub fn available_engines() -> Vec<String> {
-        let mut out = vec!["echo_core".to_string(), "echo_full".to_string()];
+        let mut out = vec![
+            "echo_core".to_string(),
+            "echo_full".to_string(),
+            Output::Mocker.to_string(),
+        ];
         #[cfg(feature = "mistralrs")]
         {
             out.push(Output::MistralRs.to_string());
@@ -107,11 +93,6 @@ impl Output {
             out.push(Output::LlamaCpp.to_string());
         }
 
-        out.push(Output::SgLang.to_string());
-        out.push(Output::Trtllm.to_string());
-        out.push(Output::Vllm.to_string());
-        out.push(Output::Mocker.to_string());
-
         out
     }
 }
Original file line number	Diff line number	Diff line change
`@@ -90,6 +90,11 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {`
`90`	`90`	`in_opt = Some(val.try_into()?);`
`91`	`91`	`}`
`92`	`92`	`"out" => {`
	`93`	`+ if val == "sglang" \|\| val == "trtllm" \|\| val == "vllm" {`
	`94`	``+ tracing::error!("To run the {val} engine please use the Python interface, see root README or look in directory `components/backends/`.");``
	`95`	`+ std::process::exit(1);`
	`96`	`+ }`
	`97`	`+`
`93`	`98`	`out_opt = Some(val.try_into()?);`
`94`	`99`	`}`
`95`	`100`	`_ => {`