
Commit 644a68d

PeaBrane authored and ZichengMa committed
feat: prefill aware routing (#1895)
1 parent 899a964 commit 644a68d

File tree

8 files changed: +169 -58 lines

components/metrics/src/bin/mock_worker.rs

Lines changed: 1 addition & 1 deletion
@@ -93,7 +93,7 @@ async fn mock_event_publisher(namespace: Namespace) {
         let event = KVHitRateEvent {
             worker_id,
             isl_blocks,
-            overlap_blocks,
+            overlap_blocks: overlap_blocks as u32,
         };

         if let Err(e) = namespace.publish(KV_HIT_RATE_SUBJECT, &event).await {

components/metrics/src/main.rs

Lines changed: 1 addition & 1 deletion
@@ -199,7 +199,7 @@ async fn app(runtime: Runtime) -> Result<()> {
                        &config_clone,
                        event.worker_id,
                        event.isl_blocks,
-                        event.overlap_blocks,
+                        event.overlap_blocks as usize,
                    );
                }
                Err(e) => {

docs/guides/dynamo_run.md

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ It supports these engines: mistralrs, llamacpp, sglang, vllm, and tensorrt-llm.

 Usage:
 ```
-dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=1.0] [--router-temperature=0.5] [--use-kv-events=true] [--verbosity (-v|-vv)]
+dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=1.0] [--router-temperature=0.0] [--use-kv-events=true] [--verbosity (-v|-vv)]
 ```

 Example: `dynamo run Qwen/Qwen3-0.6B`

launch/dynamo-run/src/flags.rs

Lines changed: 2 additions & 2 deletions
@@ -118,13 +118,13 @@ pub struct Flags {
     pub max_num_batched_tokens: Option<u32>,

     /// KV Router: Weight for overlap score in worker selection.
-    /// Higher values prioritize KV cache reuse. Default: 2.0
+    /// Higher values prioritize KV cache reuse. Default: 1.0
     #[arg(long)]
     pub kv_overlap_score_weight: Option<f64>,

     /// KV Router: Temperature for worker sampling via softmax.
     /// Higher values promote more randomness, and 0 fallbacks to deterministic.
-    /// Default: 0.5
+    /// Default: 0.0
     #[arg(long)]
     pub router_temperature: Option<f64>,

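For intuition about the temperature semantics documented above, here is a small standalone sketch. It is not the crate's `softmax_sample`; it only illustrates, assuming "lower is better" logits (as the scheduler code below states) and probabilities proportional to exp(-(logit - min) / temperature), how 0.0 falls back to a deterministic argmin while higher values randomize the pick. The function `pick_worker` and the `uniform_draw` parameter are made up for this example.

```rust
use std::collections::HashMap;

/// Hypothetical sketch (not the crate's `softmax_sample`): pick a worker from
/// "lower is better" logits. Temperature 0.0 is a deterministic argmin; higher
/// temperatures sample with weights exp(-(logit - min) / temperature).
/// `uniform_draw` stands in for a random number in [0, 1) so the sketch needs
/// no external dependencies.
fn pick_worker(logits: &HashMap<i64, f64>, temperature: f64, uniform_draw: f64) -> i64 {
    let (&argmin, &min_logit) = logits
        .iter()
        .min_by(|a, b| a.1.partial_cmp(b.1).unwrap())
        .unwrap();
    if temperature <= 0.0 {
        return argmin; // deterministic: always the lowest-cost worker
    }
    let weights: Vec<(i64, f64)> = logits
        .iter()
        .map(|(&id, &logit)| (id, (-(logit - min_logit) / temperature).exp()))
        .collect();
    let total: f64 = weights.iter().map(|(_, w)| w).sum();
    let mut cumulative = 0.0;
    for (id, w) in &weights {
        cumulative += w / total;
        if uniform_draw <= cumulative {
            return *id;
        }
    }
    argmin
}

fn main() {
    let logits = HashMap::from([(0_i64, 20.0_f64), (1, 22.0)]);
    assert_eq!(pick_worker(&logits, 0.0, 0.5), 0); // new default: deterministic
    println!("sampled: {}", pick_worker(&logits, 2.0, 0.9)); // higher temperature: randomized
}
```

With the new default of 0.0, routing always follows the lowest logit; raising the temperature flattens the weights toward a uniform choice.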

lib/llm/src/kv_router.rs

Lines changed: 11 additions & 3 deletions
@@ -78,7 +78,7 @@ impl Default for KvRouterConfig {
     fn default() -> Self {
         Self {
             overlap_score_weight: 1.0,
-            router_temperature: 0.5,
+            router_temperature: 0.0,
             use_kv_events: true,
             max_num_batched_tokens: 8192,
         }
@@ -337,6 +337,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
         let mut accumulated_tokens = Vec::new();
         let mut total_output_length = 0usize;
         let mut last_block_index = (isl.saturating_sub(1)) / block_size;
+        let mut first_push_done = false;

         while let Some(item) = response_stream.next().await {
             // Track tokens if they exist in the response
@@ -353,12 +354,19 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
                 accumulated_tokens.extend_from_slice(&output.token_ids);
                 total_output_length += output.token_ids.len();

-                // Check if we've moved to a new block
+                // Always push for the first generated token (to mark prefill done)
+                // or when we've moved to a new block
                 let current_block_index = (isl + total_output_length).saturating_sub(1) / block_size;
-                if current_block_index > last_block_index {
+                let should_push = (!first_push_done && total_output_length >= 1) ||
+                    (first_push_done && current_block_index > last_block_index);
+
+                if should_push {
                     chooser.push(&request_id, &accumulated_tokens).await;
                     accumulated_tokens.clear();
                     last_block_index = current_block_index;
+                    if !first_push_done {
+                        first_push_done = true;
+                    }
                 }

                 yield item;
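To make the new push condition concrete, here is a minimal standalone sketch of the logic in the hunk above, using made-up numbers (a 100-token prompt, 64-token blocks, one generated token per iteration) and a counter in place of `chooser.push`:

```rust
fn main() {
    // Made-up example values: a 100-token prompt with 64-token KV blocks.
    let isl: usize = 100;
    let block_size: usize = 64;

    let mut last_block_index = isl.saturating_sub(1) / block_size; // 99 / 64 = 1
    let mut first_push_done = false;
    let mut pushes = 0;

    for total_output_length in 1..=40usize {
        let current_block_index = (isl + total_output_length).saturating_sub(1) / block_size;
        // Same condition as the patched kv_router.rs: always push on the first
        // generated token (prefill is done), then only on block boundaries.
        let should_push = (!first_push_done && total_output_length >= 1)
            || (first_push_done && current_block_index > last_block_index);
        if should_push {
            pushes += 1;
            last_block_index = current_block_index;
            if !first_push_done {
                first_push_done = true;
            }
            println!("push #{pushes} at output token {total_output_length} (block {current_block_index})");
        }
    }
}
```

The first push fires at output token 1 even though the block index has not advanced, which is what lets the router mark the request's prefill as finished; the next push only happens when `isl + output` crosses a block boundary (here at output token 29, when 128 tokens fill two full blocks).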

lib/llm/src/kv_router/protocols.rs

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ pub struct WorkerSelectionResult {

     /// The number of blocks that the selected worker may already have cached.
     /// This is not a guarantee, but an estimate.
-    pub overlap_blocks: usize,
+    pub overlap_blocks: u32,
 }

 #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]

lib/llm/src/kv_router/scheduler.rs

Lines changed: 39 additions & 42 deletions
@@ -25,7 +25,6 @@ use tokio::sync::Mutex;
 use super::protocols::WorkerSelectionResult;
 use super::WorkerSelector;
 use crate::kv_router::indexer::OverlapScores;
-use crate::kv_router::indexer::WorkerId;
 use crate::kv_router::protocols::LoadMetrics;
 use crate::kv_router::scoring::ProcessedEndpoints;
 use crate::kv_router::sequence::ActiveSequencesMultiWorker;
@@ -37,7 +36,7 @@ use crate::tokens::TokenBlockSequence;
 pub struct KVHitRateEvent {
     pub worker_id: i64,
     pub isl_blocks: usize,
-    pub overlap_blocks: usize,
+    pub overlap_blocks: u32,
 }

 #[derive(Debug, thiserror::Error)]
@@ -79,13 +78,15 @@ impl Endpoint {
 #[derive(Debug)]
 pub struct SchedulingResponse {
     pub best_worker_id: i64,
+    pub overlap_blocks: u32, // Add this field
     pub endpoints_changed: Option<Vec<i64>>,
 }

 pub struct SchedulingRequest {
     pub isl_tokens: usize,
-    pub overlap: OverlapScores,
+    pub overlaps: OverlapScores,
     pub potential_blocks: HashMap<i64, usize>,
+    pub potential_tokens: HashMap<i64, usize>,
     resp_tx: tokio::sync::oneshot::Sender<SchedulingResponse>,
 }

@@ -174,6 +175,7 @@ impl KvScheduler {

             let response = SchedulingResponse {
                 best_worker_id: selection.worker_id,
+                overlap_blocks: selection.overlap_blocks,
                 endpoints_changed: pending_endpoint_update.take(),
             };
             request.respond(response);
@@ -207,18 +209,20 @@ impl KvScheduler {
         isl_tokens: usize,
         block_size: u32,
         tokens: &[u32],
-        overlap: OverlapScores,
+        overlaps: OverlapScores,
     ) -> Result<i64, KvSchedulerError> {
         let mut sequences = self.sequences.lock().await;

         let token_sequence = TokenBlockSequence::from_slice(tokens, block_size, None);
-        let potential_blocks = sequences.potential_blocks(token_sequence);
+        let (potential_blocks, potential_tokens) =
+            sequences.potential_blocks_and_tokens(token_sequence, overlaps.clone());

         let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
         let request = SchedulingRequest {
             isl_tokens,
-            overlap,
+            overlaps,
             potential_blocks,
+            potential_tokens,
             resp_tx,
         };
         self.request_tx
@@ -234,31 +238,16 @@ impl KvScheduler {
         }

         let token_sequence = TokenBlockSequence::from_slice(tokens, block_size, None);
-        sequences.add_request(request_id, token_sequence, response.best_worker_id);
+        sequences.add_request(
+            request_id,
+            token_sequence,
+            response.overlap_blocks,
+            response.best_worker_id,
+        );

         Ok(response.best_worker_id)
     }

-    /// Find the potential blocks for each worker if the sequence were routed there
-    pub async fn potential_blocks(
-        &self,
-        token_sequence: TokenBlockSequence,
-    ) -> HashMap<i64, usize> {
-        let sequences = self.sequences.lock().await;
-        sequences.potential_blocks(token_sequence)
-    }
-
-    /// Add a new request with its initial tokens to a specific worker
-    pub async fn add_request(
-        &self,
-        request_id: String,
-        token_sequence: TokenBlockSequence,
-        worker_id: WorkerId,
-    ) {
-        let mut sequences = self.sequences.lock().await;
-        sequences.add_request(request_id, token_sequence, worker_id)
-    }
-
     /// Push tokens to a specific request's sequence
     pub async fn push(&self, request_id: &String, tokens: &[u32]) {
         let mut sequences = self.sequences.lock().await;
@@ -370,34 +359,47 @@ impl WorkerSelector for DefaultWorkerSelector {
             return Err(KvSchedulerError::NoEndpoints);
         }

-        let request_blocks = request.isl_tokens.div_ceil(block_size as usize);
+        let isl = request.isl_tokens;
+        let request_blocks = isl.div_ceil(block_size as usize);
+        let overlaps = &request.overlaps.scores;
+
+        // active blocks for decoding
         let potential_active_blocks = &request.potential_blocks;
+        // active tokens in the batch (processed by the linear layers), mostly prefill tokens
+        let potential_active_tokens = &request.potential_tokens;

         let mut worker_logits = HashMap::new();
         let mut max_logit = f64::NEG_INFINITY;

         // Calculate logits for each worker
         for (worker_id, _) in workers.endpoints.iter() {
-            let cached_blocks = request.overlap.scores.get(worker_id).copied().unwrap_or(0) as f64;
-            let prefill_blocks = request_blocks as f64 - cached_blocks;
+            // this is the number of tokens each worker would have if the request were scheduled there
+            let potential_tokens = *potential_active_tokens.get(worker_id).unwrap_or_else(|| {
+                tracing::warn!(
+                    "assuming {isl} tokens for {worker_id}, as the endpoint does not exist yet"
+                );
+                &isl
+            }) as f64;

             // this is the number of blocks each worker would have if the request were scheduled there
             let potential_blocks = *potential_active_blocks.get(worker_id).unwrap_or_else(||
-                {tracing::warn!("assuming 0 decoding blocks for {worker_id}, as the load metrics endpoint does not exist yet");
-                &0
+                {tracing::warn!("assuming {request_blocks} decoding blocks for {worker_id}, as the endpoint does not exist yet");
+                &request_blocks
             }) as f64;

+            let potential_prefill_blocks = potential_tokens / (block_size as f64);
+
             // Calculate logit (lower is better)
-            let logit =
-                self.kv_router_config.overlap_score_weight * prefill_blocks + potential_blocks;
+            let logit = self.kv_router_config.overlap_score_weight * potential_prefill_blocks
+                + potential_blocks;
             max_logit = max_logit.max(logit);

             worker_logits.insert(*worker_id, logit);

             tracing::info!(
-                "Formula for {worker_id}: {logit:.3} = {:.1} * {prefill_blocks:.3} + {potential_blocks:.3} (cached_blocks: {cached_blocks})",
+                "Formula for {worker_id}: {logit:.3} = {:.1} * {potential_prefill_blocks:.3} + {potential_blocks:.3} (cached_blocks: {})",
                 self.kv_router_config.overlap_score_weight,
-                cached_blocks = cached_blocks
+                overlaps.get(worker_id).unwrap_or(&0),
             );
         }

@@ -412,12 +414,7 @@ impl WorkerSelector for DefaultWorkerSelector {
         let temperature = self.kv_router_config.router_temperature;
         let best_worker_id = softmax_sample(&worker_logits, temperature);

-        let overlap_blocks = request
-            .overlap
-            .scores
-            .get(&best_worker_id)
-            .copied()
-            .unwrap_or(0) as usize;
+        let overlap_blocks = overlaps.get(&best_worker_id).copied().unwrap_or(0);
         let best_logit = worker_logits[&best_worker_id];

         tracing::info!(
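For intuition about the new cost function, here is a small standalone sketch with made-up numbers (two hypothetical workers, a 512-token request, 64-token blocks, `overlap_score_weight = 1.0`). It replaces `softmax_sample` with a plain argmin, which matches the new `router_temperature = 0.0` default, and the per-worker potentials are invented for illustration rather than taken from `ActiveSequencesMultiWorker`:

```rust
fn main() {
    // Made-up scenario: a 512-token request (8 blocks of 64 tokens each).
    let block_size = 64.0_f64;
    let overlap_score_weight = 1.0_f64;

    // Hypothetical (worker_id, potential prefill tokens, potential decode blocks)
    // if the request were routed to that worker.
    let potentials = [
        (0_i64, 512.0_f64, 12.0_f64), // worker 0: no cache hit, lighter decode load
        (1_i64, 128.0_f64, 20.0_f64), // worker 1: 6 of 8 blocks cached, heavier decode load
    ];

    let mut best: Option<(i64, f64)> = None;
    for (worker_id, potential_tokens, potential_blocks) in potentials {
        // Same shape as the patched logit: weight * prefill blocks + decode blocks
        // (lower is better).
        let potential_prefill_blocks = potential_tokens / block_size;
        let logit = overlap_score_weight * potential_prefill_blocks + potential_blocks;
        println!("worker {worker_id}: logit = {logit:.3}");
        if best.map_or(true, |(_, b)| logit < b) {
            best = Some((worker_id, logit));
        }
    }

    // worker 0: 1.0 * 8.0 + 12.0 = 20.0
    // worker 1: 1.0 * 2.0 + 20.0 = 22.0
    // With the deterministic (temperature 0.0) argmin, worker 0 wins even though
    // worker 1 has the cache hits, because its projected decode load is lighter.
    let (winner, winning_logit) = best.expect("at least one worker");
    println!("selected worker {winner} with logit {winning_logit:.3}");
}
```

In this invented example, raising `--kv-overlap-score-weight` past roughly 1.33 would penalize the uncached prefill enough to tip the choice toward worker 1 instead.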
