
Commit b5b5792 ("debug")
1 parent 7bb034c

File tree: 5 files changed, +78 -25 lines

  components/http/src/main.rs
  docs/guides/dynamo_run.md
  launch/dynamo-run/src/input/common.rs
  launch/llmctl/src/main.rs
  lib/llm/src/kv_router/scheduler.rs

components/http/src/main.rs

Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ async fn app(runtime: Runtime) -> Result<()> {
     // the cli when operating on an `http` component will validate the namespace.component is
     // registered with HttpServiceComponentDefinition

-    let watch_obj = ModelWatcher::new(distributed.clone(), manager, RouterMode::Random);
+    let watch_obj = ModelWatcher::new(distributed.clone(), manager, RouterMode::Random, None, None, None);

     if let Some(etcd_client) = distributed.etcd_client() {
         let models_watcher: PrefixWatcher =
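For context only (not part of this diff): the three trailing `None` arguments appear to correspond to the KV-routing weights added to `DefaultWorkerSelector` in `scheduler.rs` below (overlap score, GPU cache usage, waiting requests). A minimal sketch of the same call with the weights supplied explicitly, assuming `ModelWatcher::new` now takes them as `Option<f64>` in that order:

```rust
// Sketch under assumptions: ModelWatcher::new(runtime, manager, router_mode,
// overlap_score_weight, gpu_cache_usage_weight, waiting_requests_weight),
// where each weight is an Option<f64> that falls back to a default when None.
let watch_obj = ModelWatcher::new(
    distributed.clone(),
    manager,
    RouterMode::Random,
    Some(2.0), // overlap score weight (default per DefaultWorkerSelector)
    Some(1.0), // GPU cache usage weight
    Some(1.0), // waiting requests weight
);
```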

docs/guides/dynamo_run.md

Lines changed: 33 additions & 20 deletions

@@ -1,24 +1,37 @@
 # Running Dynamo (`dynamo run`)

-* [Quickstart with pip and vllm](#quickstart-with-pip-and-vllm)
-* [Automatically download a model from Hugging Face](#use-model-from-hugging-face)
-* [Run a model from local file](#run-a-model-from-local-file)
-* [Distributed system](#distributed-system)
-* [Network names](#network-names)
-* [KV-aware routing](#kv-aware-routing)
-* [Full usage details](#full-usage-details)
-* [Setup](#setup)
-* [mistral.rs](#mistralrs)
-* [llama.cpp](#llamacpp)
-* [Sglang](#sglang)
-* [Vllm](#vllm)
-* [TensorRT-LLM](#trtllm)
-* [Echo Engines](#echo-engines)
-* [Writing your own engine in Python](#writing-your-own-engine-in-python)
-* [Batch mode](#batch-mode)
-* [Defaults](#defaults)
-* [Extra engine arguments](#extra-engine-arguments)
-
+- [Running Dynamo (`dynamo run`)](#running-dynamo-dynamo-run)
+- [Quickstart with pip and vllm](#quickstart-with-pip-and-vllm)
+- [Use model from Hugging Face](#use-model-from-hugging-face)
+- [Run a model from local file](#run-a-model-from-local-file)
+- [Download model from Hugging Face](#download-model-from-hugging-face)
+- [Run model from local file](#run-model-from-local-file)
+- [Distributed System](#distributed-system)
+- [Network names](#network-names)
+- [KV-aware routing](#kv-aware-routing)
+- [Full usage details](#full-usage-details)
+- [Getting Started](#getting-started)
+- [Setup](#setup)
+- [Step 1: Install libraries](#step-1-install-libraries)
+- [Step 2: Install Rust](#step-2-install-rust)
+- [Step 3: Build](#step-3-build)
+- [Defaults](#defaults)
+- [Running Inference with Pre-built Engines](#running-inference-with-pre-built-engines)
+- [mistralrs](#mistralrs)
+- [llamacpp](#llamacpp)
+- [sglang](#sglang)
+- [vllm](#vllm)
+- [trtllm](#trtllm)
+- [Step 1: Build the environment](#step-1-build-the-environment)
+- [Step 2: Run the environment](#step-2-run-the-environment)
+- [Step 3: Execute `dynamo run` command](#step-3-execute-dynamo-run-command)
+- [Echo Engines](#echo-engines)
+- [echo\_core](#echo_core)
+- [echo\_full](#echo_full)
+- [Configuration](#configuration)
+- [Batch mode](#batch-mode)
+- [Extra engine arguments](#extra-engine-arguments)
+- [Writing your own engine in Python](#writing-your-own-engine-in-python)

 This guide explains the `dynamo run` command.

@@ -28,7 +41,7 @@ It supports these engines: mistralrs, llamacpp, sglang, vllm, and tensorrt-llm.

 Usage:
 ```
-dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv]
+dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0]
 ```

 Example: `dynamo run Qwen/Qwen3-0.6B`
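An illustrative invocation of the new flags (not part of the diff; the weight values simply restate the `DefaultWorkerSelector` defaults, and the engine and model shown are placeholders taken from the usage line):

```
dynamo-run in=http out=vllm --model-path Qwen/Qwen3-0.6B \
    --router-mode kv \
    --kv-overlap-score-weight=2.0 \
    --kv-gpu-cache-usage-weight=1.0 \
    --kv-waiting-requests-weight=1.0
```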

launch/dynamo-run/src/input/common.rs

Lines changed: 3 additions & 0 deletions

@@ -50,6 +50,9 @@ pub async fn prepare_engine(
         distributed_runtime,
         model_manager.clone(),
         dynamo_runtime::pipeline::RouterMode::RoundRobin,
+        None,
+        None,
+        None,
     ));
     let models_watcher = etcd_client.kv_get_and_watch_prefix(MODEL_ROOT_PATH).await?;
     let (_prefix, _watcher, receiver) = models_watcher.dissolve();

launch/llmctl/src/main.rs

Lines changed: 6 additions & 0 deletions

@@ -258,6 +258,9 @@ async fn list_models(
         distributed.clone(),
         Arc::new(ModelManager::new()),
         RouterMode::Random,
+        None,
+        None,
+        None,
     );

     let mut models = Vec::new();

@@ -313,6 +316,9 @@ async fn remove_model(
         distributed.clone(),
         Arc::new(ModelManager::new()),
         RouterMode::Random,
+        None,
+        None,
+        None,
     );
     let Some(etcd_client) = distributed.etcd_client() else {
         anyhow::bail!("llmctl is only useful with dynamic workers");

lib/llm/src/kv_router/scheduler.rs

Lines changed: 35 additions & 4 deletions

@@ -96,7 +96,7 @@ impl KvScheduler {
         endpoints_rx: tokio::sync::watch::Receiver<ProcessedEndpoints>,
         selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
     ) -> Result<Self, KvSchedulerError> {
-        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector));
+        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector::default()));
         let mut endpoints_rx = endpoints_rx;
         let mut endpoints: ProcessedEndpoints = endpoints_rx.borrow_and_update().clone();

@@ -231,8 +231,36 @@
 }

 // Default implementation matching the Python _cost_function
-#[derive(Default)]
-pub struct DefaultWorkerSelector;
+#[derive(Debug, Clone)]
+pub struct DefaultWorkerSelector {
+    pub overlap_score_weight: f64,
+    pub gpu_cache_usage_weight: f64,
+    pub waiting_requests_weight: f64,
+}
+
+impl Default for DefaultWorkerSelector {
+    fn default() -> Self {
+        Self {
+            overlap_score_weight: 2.0,
+            gpu_cache_usage_weight: 1.0,
+            waiting_requests_weight: 1.0,
+        }
+    }
+}
+
+impl DefaultWorkerSelector {
+    pub fn new(
+        overlap_score_weight: Option<f64>,
+        gpu_cache_usage_weight: Option<f64>,
+        waiting_requests_weight: Option<f64>,
+    ) -> Self {
+        Self {
+            overlap_score_weight: overlap_score_weight.unwrap_or(2.0),
+            gpu_cache_usage_weight: gpu_cache_usage_weight.unwrap_or(1.0),
+            waiting_requests_weight: waiting_requests_weight.unwrap_or(1.0),
+        }
+    }
+}

 impl WorkerSelector for DefaultWorkerSelector {
     fn select_worker(
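For reference (not part of the diff): because `DefaultWorkerSelector::new` falls back to the old defaults for any weight passed as `None`, partial overrides compose naturally. A small usage sketch based only on the constructor and `Default` impl above:

```rust
// Override only the overlap-score weight; the other two keep their
// defaults (1.0) via unwrap_or in DefaultWorkerSelector::new.
let selector = DefaultWorkerSelector::new(Some(3.0), None, None);
assert_eq!(selector.overlap_score_weight, 3.0);
assert_eq!(selector.gpu_cache_usage_weight, 1.0);
assert_eq!(selector.waiting_requests_weight, 1.0);

// Equivalent to the previous hard-coded behaviour:
let default_selector = DefaultWorkerSelector::default(); // 2.0 / 1.0 / 1.0
assert_eq!(default_selector.overlap_score_weight, 2.0);
```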
@@ -289,7 +317,10 @@ impl WorkerSelector for DefaultWorkerSelector {
         let logit = self.overlap_score_weight * score - self.gpu_cache_usage_weight * gpu_cache_usage - self.waiting_requests_weight * normalized_waiting;

         tracing::trace!(
-            "Formula for {worker_id}: {logit:.3} = {self.overlap_score_weight:.3} * {score:.3} - {self.gpu_cache_usage_weight:.3} * {gpu_cache_usage:.3} - {self.waiting_requests_weight:.3} * {normalized_waiting:.3}",
+            "Formula for {worker_id}: {logit:.3} = {:.1} * {score:.3} - {:.1} * {gpu_cache_usage:.3} - {:.1} * {normalized_waiting:.3}",
+            self.overlap_score_weight,
+            self.gpu_cache_usage_weight,
+            self.waiting_requests_weight,
         );

         // Track best workers
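A note on why the `trace!` call changed (explanatory, not part of the diff): Rust's inline format-argument capture only accepts plain identifiers, so a field expression such as `{self.overlap_score_weight:.3}` inside the format string does not compile; field accesses have to be passed as separate arguments, as the new code does. A tiny standalone illustration:

```rust
struct Weights {
    overlap: f64,
}

fn main() {
    let w = Weights { overlap: 2.0 };
    let overlap = w.overlap;
    println!("{overlap:.3}");     // inline capture works for plain identifiers
    println!("{:.3}", w.overlap); // field access must be a positional argument
    // println!("{w.overlap:.3}");   // rejected: expressions are not allowed in format strings
}
```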

0 commit comments