ai-dynamo
diff --git a/‎launch/dynamo-run/src/flags.rs‎
Lines changed: 8 additions & 0 deletions b/‎launch/dynamo-run/src/flags.rs‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎lib/bindings/python/rust/llm/entrypoint.rs‎
Lines changed: 3 additions & 1 deletion b/‎lib/bindings/python/rust/llm/entrypoint.rs‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎lib/llm/src/kv_router.rs‎
Lines changed: 39 additions & 8 deletions b/‎lib/llm/src/kv_router.rs‎
Lines changed: 39 additions & 8 deletions
@@ -99,6 +99,13 @@ pub struct Flags {
     #[arg(long)]
     pub router_replica_sync: Option<bool>,
 
+    /// KV Router: Whether to track active blocks in the router for memory management.
+    /// When false, the router will not maintain state about which blocks are active,
+    /// reducing memory overhead but potentially affecting scheduling decisions.
+    /// Default: true
+    #[arg(long)]
+    pub router_track_active_blocks: Option<bool>,
+
     /// Max model context length. Reduce this if you don't have enough VRAM for the full model
     /// context length (e.g. Llama 4).
     /// Defaults to the model's max, which is usually model_max_length in tokenizer_config.json.
@@ -228,6 +235,7 @@ impl Flags {
                 self.router_temperature,
                 self.use_kv_events,
                 self.router_replica_sync,
+                self.router_track_active_blocks,
                 self.max_num_batched_tokens,
                 // defaulting below args (no longer maintaining new flags for dynamo-run)
                 None,
 
@@ -42,12 +42,13 @@ impl KvRouterConfig {
 #[pymethods]
 impl KvRouterConfig {
     #[new]
-    #[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, router_replica_sync=false, router_snapshot_threshold=10000, router_reset_states=false))]
+    #[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, router_replica_sync=false, router_track_active_blocks=true, router_snapshot_threshold=10000, router_reset_states=false))]
     fn new(
         overlap_score_weight: f64,
         router_temperature: f64,
         use_kv_events: bool,
         router_replica_sync: bool,
+        router_track_active_blocks: bool,
         router_snapshot_threshold: Option<u32>,
         router_reset_states: bool,
     ) -> Self {
@@ -57,6 +58,7 @@ impl KvRouterConfig {
                 router_temperature,
                 use_kv_events,
                 router_replica_sync,
+                router_track_active_blocks,
                 router_snapshot_threshold,
                 router_reset_states,
                 ..Default::default()
 
@@ -23,7 +23,6 @@ use serde::{Deserialize, Serialize};
 pub mod approx;
 pub mod indexer;
 pub mod metrics_aggregator;
-pub mod prefill_counter;
 pub mod protocols;
 pub mod publisher;
 pub mod recorder;
@@ -102,6 +101,9 @@ pub struct KvRouterConfig {
 
     pub router_replica_sync: bool,
 
+    /// Whether to track active blocks in the router (default: true)
+    pub router_track_active_blocks: bool,
+
     // TODO: this is not actually used for now
     // Would need this (along with total kv blocks) to trigger AllWorkersBusy error for e.g. rate-limiting
     pub max_num_batched_tokens: u32,
@@ -120,6 +122,7 @@ impl Default for KvRouterConfig {
             router_temperature: 0.0,
             use_kv_events: true,
             router_replica_sync: false,
+            router_track_active_blocks: true,
             max_num_batched_tokens: 8192,
             router_snapshot_threshold: Some(10000),
             router_reset_states: false,
@@ -130,11 +133,13 @@ impl Default for KvRouterConfig {
 impl KvRouterConfig {
     /// Create a new KvRouterConfig with optional weight values.
     /// If a weight is None, the default value will be used.
+    #[allow(clippy::too_many_arguments)]
     pub fn new(
         overlap_score_weight: Option<f64>,
         temperature: Option<f64>,
         use_kv_events: Option<bool>,
         replica_sync: Option<bool>,
+        track_active_blocks: Option<bool>,
         max_num_batched_tokens: Option<u32>,
         router_snapshot_threshold: Option<Option<u32>>,
         router_reset_states: Option<bool>,
@@ -145,6 +150,8 @@ impl KvRouterConfig {
             router_temperature: temperature.unwrap_or(default.router_temperature),
             use_kv_events: use_kv_events.unwrap_or(default.use_kv_events),
             router_replica_sync: replica_sync.unwrap_or(default.router_replica_sync),
+            router_track_active_blocks: track_active_blocks
+                .unwrap_or(default.router_track_active_blocks),
             max_num_batched_tokens: max_num_batched_tokens
                 .unwrap_or(default.max_num_batched_tokens),
             router_snapshot_threshold: router_snapshot_threshold
@@ -189,6 +196,8 @@ pub struct KvRouter {
     scheduler: KvScheduler,
 
     block_size: u32,
+
+    kv_router_config: KvRouterConfig,
 }
 
 impl KvRouter {
@@ -257,6 +266,7 @@ impl KvRouter {
             runtime_configs_rx,
             selector,
             kv_router_config.router_replica_sync,
+            consumer_uuid.clone(),
         )
         .await?;
 
@@ -282,6 +292,7 @@ impl KvRouter {
             indexer,
             scheduler,
             block_size,
+            kv_router_config,
         })
     }
 
@@ -302,12 +313,25 @@ impl KvRouter {
 
         let overlap_scores = self.indexer.find_matches(block_hashes.clone()).await?;
 
+        // Determine who needs seq_hashes
+        let approx_indexer_needs_it = matches!(self.indexer, Indexer::ApproxKvIndexer(_));
+        let scheduler_needs_it = self.kv_router_config.router_track_active_blocks;
+
+        // Optimize cloning: only clone if both need it, otherwise move
+        let (maybe_seq_hashes_1, maybe_seq_hashes_2) =
+            match (approx_indexer_needs_it, scheduler_needs_it) {
+                (true, true) => (Some(seq_hashes.clone()), Some(seq_hashes)),
+                (true, false) => (Some(seq_hashes), None),
+                (false, true) => (None, Some(seq_hashes)),
+                (false, false) => (None, None),
+            };
+
         let best_worker_id = self
             .scheduler
             .schedule(
                 context_id.to_string(),
                 isl_tokens,
-                seq_hashes.clone(),
+                maybe_seq_hashes_2,
                 overlap_scores.clone(),
                 router_config_override,
                 update_states,
@@ -316,7 +340,7 @@ impl KvRouter {
 
         if let Indexer::ApproxKvIndexer(ref indexer) = self.indexer {
             indexer
-                .process_routing_decision(best_worker_id, block_hashes, seq_hashes)
+                .process_routing_decision(best_worker_id, block_hashes, maybe_seq_hashes_1.unwrap())
                 .await
                 .unwrap();
         };
@@ -337,13 +361,16 @@ impl KvRouter {
         worker_id: i64,
     ) {
         let isl_tokens = tokens.len();
-        let block_hashes = compute_block_hash_for_seq(tokens, self.block_size);
-        let seq_hashes = compute_seq_hash_for_block(&block_hashes);
+
+        let maybe_seq_hashes = self.kv_router_config.router_track_active_blocks.then(|| {
+            let block_hashes = compute_block_hash_for_seq(tokens, self.block_size);
+            compute_seq_hash_for_block(&block_hashes)
+        });
 
         self.scheduler
             .add_request(
                 request_id,
-                seq_hashes,
+                maybe_seq_hashes,
                 isl_tokens,
                 overlap_blocks,
                 worker_id,
@@ -367,12 +394,16 @@ impl KvRouter {
     pub async fn get_potential_loads(&self, tokens: &[u32]) -> Result<Vec<PotentialLoad>> {
         let isl_tokens = tokens.len();
         let block_hashes = compute_block_hash_for_seq(tokens, self.block_size);
-        let seq_hashes = compute_seq_hash_for_block(&block_hashes);
         let overlap_scores = self.indexer.find_matches(block_hashes).await?;
 
+        let maybe_seq_hashes = self.kv_router_config.router_track_active_blocks.then(|| {
+            let block_hashes = compute_block_hash_for_seq(tokens, self.block_size);
+            compute_seq_hash_for_block(&block_hashes)
+        });
+
         Ok(self
             .scheduler
-            .get_potential_loads(seq_hashes, isl_tokens, overlap_scores)
+            .get_potential_loads(maybe_seq_hashes, isl_tokens, overlap_scores)
             .await)
     }