
Commit b5b5792 ("debug")
1 parent 7bb034c

File tree: 5 files changed, +78 -25 lines

  components/http/src/main.rs
  docs/guides/dynamo_run.md
  launch/dynamo-run/src/input/common.rs
  launch/llmctl/src/main.rs
  lib/llm/src/kv_router/scheduler.rs

components/http/src/main.rs

Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ async fn app(runtime: Runtime) -> Result<()> {
     // the cli when operating on an `http` component will validate the namespace.component is
     // registered with HttpServiceComponentDefinition

-    let watch_obj = ModelWatcher::new(distributed.clone(), manager, RouterMode::Random);
+    let watch_obj = ModelWatcher::new(distributed.clone(), manager, RouterMode::Random, None, None, None);

     if let Some(etcd_client) = distributed.etcd_client() {
         let models_watcher: PrefixWatcher =
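For context only (not part of this diff): the three trailing `None` arguments appear to correspond to the KV-routing weights added to `DefaultWorkerSelector` in `scheduler.rs` below (overlap score, GPU cache usage, waiting requests). A minimal sketch of the same call with the weights supplied explicitly, assuming `ModelWatcher::new` now takes them as `Option<f64>` in that order:

```rust
// Sketch under assumptions: ModelWatcher::new(runtime, manager, router_mode,
// overlap_score_weight, gpu_cache_usage_weight, waiting_requests_weight),
// where each weight is an Option<f64> that falls back to a default when None.
let watch_obj = ModelWatcher::new(
    distributed.clone(),
    manager,
    RouterMode::Random,
    Some(2.0), // overlap score weight (default per DefaultWorkerSelector)
    Some(1.0), // GPU cache usage weight
    Some(1.0), // waiting requests weight
);
```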

docs/guides/dynamo_run.md

Lines changed: 33 additions & 20 deletions

@@ -1,24 +1,37 @@
 # Running Dynamo (`dynamo run`)

-* [Quickstart with pip and vllm](#quickstart-with-pip-and-vllm)
-* [Automatically download a model from Hugging Face](#use-model-from-hugging-face)
-* [Run a model from local file](#run-a-model-from-local-file)
-* [Distributed system](#distributed-system)
-* [Network names](#network-names)
-* [KV-aware routing](#kv-aware-routing)
-* [Full usage details](#full-usage-details)
-* [Setup](#setup)
-* [mistral.rs](#mistralrs)
-* [llama.cpp](#llamacpp)
-* [Sglang](#sglang)
-* [Vllm](#vllm)
-* [TensorRT-LLM](#trtllm)
-* [Echo Engines](#echo-engines)
-* [Writing your own engine in Python](#writing-your-own-engine-in-python)
-* [Batch mode](#batch-mode)
-* [Defaults](#defaults)
-* [Extra engine arguments](#extra-engine-arguments)
-
+- [Running Dynamo (`dynamo run`)](#running-dynamo-dynamo-run)
+- [Quickstart with pip and vllm](#quickstart-with-pip-and-vllm)
+- [Use model from Hugging Face](#use-model-from-hugging-face)
+- [Run a model from local file](#run-a-model-from-local-file)
+- [Download model from Hugging Face](#download-model-from-hugging-face)
+- [Run model from local file](#run-model-from-local-file)
+- [Distributed System](#distributed-system)
+- [Network names](#network-names)
+- [KV-aware routing](#kv-aware-routing)
+- [Full usage details](#full-usage-details)
+- [Getting Started](#getting-started)
+- [Setup](#setup)
+- [Step 1: Install libraries](#step-1-install-libraries)
+- [Step 2: Install Rust](#step-2-install-rust)
+- [Step 3: Build](#step-3-build)
+- [Defaults](#defaults)
+- [Running Inference with Pre-built Engines](#running-inference-with-pre-built-engines)
+- [mistralrs](#mistralrs)
+- [llamacpp](#llamacpp)
+- [sglang](#sglang)
+- [vllm](#vllm)
+- [trtllm](#trtllm)
+- [Step 1: Build the environment](#step-1-build-the-environment)
+- [Step 2: Run the environment](#step-2-run-the-environment)
+- [Step 3: Execute `dynamo run` command](#step-3-execute-dynamo-run-command)
+- [Echo Engines](#echo-engines)
+- [echo\_core](#echo_core)
+- [echo\_full](#echo_full)
+- [Configuration](#configuration)
+- [Batch mode](#batch-mode)
+- [Extra engine arguments](#extra-engine-arguments)
+- [Writing your own engine in Python](#writing-your-own-engine-in-python)

 This guide explains the `dynamo run` command.

@@ -28,7 +41,7 @@ It supports these engines: mistralrs, llamacpp, sglang, vllm, and tensorrt-llm.

 Usage:
 ```
-dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv]
+dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0]
 ```

 Example: `dynamo run Qwen/Qwen3-0.6B`
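An illustrative invocation of the new flags (not part of the diff; the weight values simply restate the `DefaultWorkerSelector` defaults, and the engine and model shown are placeholders taken from the usage line):

```
dynamo-run in=http out=vllm --model-path Qwen/Qwen3-0.6B \
    --router-mode kv \
    --kv-overlap-score-weight=2.0 \
    --kv-gpu-cache-usage-weight=1.0 \
    --kv-waiting-requests-weight=1.0
```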

launch/dynamo-run/src/input/common.rs

Lines changed: 3 additions & 0 deletions

@@ -50,6 +50,9 @@ pub async fn prepare_engine(
         distributed_runtime,
         model_manager.clone(),
         dynamo_runtime::pipeline::RouterMode::RoundRobin,
+        None,
+        None,
+        None,
     ));
     let models_watcher = etcd_client.kv_get_and_watch_prefix(MODEL_ROOT_PATH).await?;
     let (_prefix, _watcher, receiver) = models_watcher.dissolve();

launch/llmctl/src/main.rs

Lines changed: 6 additions & 0 deletions

@@ -258,6 +258,9 @@ async fn list_models(
         distributed.clone(),
         Arc::new(ModelManager::new()),
         RouterMode::Random,
+        None,
+        None,
+        None,
     );

     let mut models = Vec::new();

@@ -313,6 +316,9 @@ async fn remove_model(
         distributed.clone(),
         Arc::new(ModelManager::new()),
         RouterMode::Random,
+        None,
+        None,
+        None,
     );
     let Some(etcd_client) = distributed.etcd_client() else {
         anyhow::bail!("llmctl is only useful with dynamic workers");

lib/llm/src/kv_router/scheduler.rs

Lines changed: 35 additions & 4 deletions

@@ -96,7 +96,7 @@ impl KvScheduler {
         endpoints_rx: tokio::sync::watch::Receiver<ProcessedEndpoints>,
         selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
     ) -> Result<Self, KvSchedulerError> {
-        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector));
+        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector::default()));
         let mut endpoints_rx = endpoints_rx;
         let mut endpoints: ProcessedEndpoints = endpoints_rx.borrow_and_update().clone();

@@ -231,8 +231,36 @@
 }

 // Default implementation matching the Python _cost_function
-#[derive(Default)]
-pub struct DefaultWorkerSelector;
+#[derive(Debug, Clone)]
+pub struct DefaultWorkerSelector {
+    pub overlap_score_weight: f64,
+    pub gpu_cache_usage_weight: f64,
+    pub waiting_requests_weight: f64,
+}
+
+impl Default for DefaultWorkerSelector {
+    fn default() -> Self {
+        Self {
+            overlap_score_weight: 2.0,
+            gpu_cache_usage_weight: 1.0,
+            waiting_requests_weight: 1.0,
+        }
+    }
+}
+
+impl DefaultWorkerSelector {
+    pub fn new(
+        overlap_score_weight: Option<f64>,
+        gpu_cache_usage_weight: Option<f64>,
+        waiting_requests_weight: Option<f64>,
+    ) -> Self {
+        Self {
+            overlap_score_weight: overlap_score_weight.unwrap_or(2.0),
+            gpu_cache_usage_weight: gpu_cache_usage_weight.unwrap_or(1.0),
+            waiting_requests_weight: waiting_requests_weight.unwrap_or(1.0),
+        }
+    }
+}

 impl WorkerSelector for DefaultWorkerSelector {
     fn select_worker(
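For reference (not part of the diff): because `DefaultWorkerSelector::new` falls back to the old defaults for any weight passed as `None`, partial overrides compose naturally. A small usage sketch based only on the constructor and `Default` impl above:

```rust
// Override only the overlap-score weight; the other two keep their
// defaults (1.0) via unwrap_or in DefaultWorkerSelector::new.
let selector = DefaultWorkerSelector::new(Some(3.0), None, None);
assert_eq!(selector.overlap_score_weight, 3.0);
assert_eq!(selector.gpu_cache_usage_weight, 1.0);
assert_eq!(selector.waiting_requests_weight, 1.0);

// Equivalent to the previous hard-coded behaviour:
let default_selector = DefaultWorkerSelector::default(); // 2.0 / 1.0 / 1.0
assert_eq!(default_selector.overlap_score_weight, 2.0);
```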
@@ -289,7 +317,10 @@ impl WorkerSelector for DefaultWorkerSelector {
         let logit = self.overlap_score_weight * score - self.gpu_cache_usage_weight * gpu_cache_usage - self.waiting_requests_weight * normalized_waiting;

         tracing::trace!(
-            "Formula for {worker_id}: {logit:.3} = {self.overlap_score_weight:.3} * {score:.3} - {self.gpu_cache_usage_weight:.3} * {gpu_cache_usage:.3} - {self.waiting_requests_weight:.3} * {normalized_waiting:.3}",
+            "Formula for {worker_id}: {logit:.3} = {:.1} * {score:.3} - {:.1} * {gpu_cache_usage:.3} - {:.1} * {normalized_waiting:.3}",
+            self.overlap_score_weight,
+            self.gpu_cache_usage_weight,
+            self.waiting_requests_weight,
         );

         // Track best workers
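A note on why the `trace!` call changed (explanatory, not part of the diff): Rust's inline format-argument capture only accepts plain identifiers, so a field expression such as `{self.overlap_score_weight:.3}` inside the format string does not compile; field accesses have to be passed as separate arguments, as the new code does. A tiny standalone illustration:

```rust
struct Weights {
    overlap: f64,
}

fn main() {
    let w = Weights { overlap: 2.0 };
    let overlap = w.overlap;
    println!("{overlap:.3}");     // inline capture works for plain identifiers
    println!("{:.3}", w.overlap); // field access must be a positional argument
    // println!("{w.overlap:.3}");   // rejected: expressions are not allowed in format strings
}
```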

0 commit comments