ai-dynamo
diff --git a/‎docs/architecture/kv_cache_routing.md‎
Lines changed: 66 additions & 0 deletions b/‎docs/architecture/kv_cache_routing.md‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎lib/bindings/python/rust/llm/kv.rs‎
Lines changed: 66 additions & 5 deletions b/‎lib/bindings/python/rust/llm/kv.rs‎
Lines changed: 66 additions & 5 deletions
diff --git a/‎lib/bindings/python/src/dynamo/_core.pyi‎
Lines changed: 48 additions & 0 deletions b/‎lib/bindings/python/src/dynamo/_core.pyi‎
Lines changed: 48 additions & 0 deletions
@@ -292,4 +292,70 @@ if __name__ == "__main__":
     asyncio.run(main())
 ```
 
+### Additional Routing Features
+
+The `KvPushRouter` provides additional methods for fine-grained control:
+
+- **`best_worker_id()`**: Query which worker would be selected for the given tokens without actually routing the request. Takes a `context_id` and `token_ids`, and returns `(worker_id, overlap_blocks)`.
+- **`get_potential_loads()`**: Get detailed load information for all workers including potential prefill tokens and active decode blocks.
+- **`worker_id` parameter in `generate()`**: Force routing to a specific worker by passing `worker_id=<id>` to bypass the automatic KV-aware selection.
+
 The `router_config_override` parameter allows you to adjust routing behavior per request without recreating the router. This is useful for implementing different routing strategies based on request characteristics.
+
+### Custom Routing Example: Minimizing TTFT
+
+Here's an example of using `get_potential_loads()` to implement custom routing that minimizes Time To First Token (TTFT) by selecting the worker with the least prefill work:
+
+```python
+import asyncio
+from dynamo._core import DistributedRuntime, KvPushRouter, KvRouterConfig
+
+async def minimize_ttft_routing():
+    # Setup router
+    runtime = DistributedRuntime.detached()
+    namespace = runtime.namespace("inference")
+    component = namespace.component("vllm")
+    endpoint = component.endpoint("generate")
+
+    router = KvPushRouter(
+        endpoint=endpoint,
+        block_size=16,
+        kv_router_config=KvRouterConfig()
+    )
+
+    # Your input tokens
+    token_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+    # Get potential loads for all workers
+    potential_loads = await router.get_potential_loads(token_ids)
+
+    # Find worker with minimum prefill tokens (best for TTFT)
+    best_worker = min(potential_loads, key=lambda x: x['potential_prefill_tokens'])
+
+    print(f"Worker loads: {potential_loads}")
+    print(f"Selected worker {best_worker['worker_id']} with {best_worker['potential_prefill_tokens']} prefill tokens")
+
+    # Route directly to the selected worker
+    stream = await router.generate(
+        token_ids=token_ids,
+        model="meta-llama/Llama-2-7b-hf",
+        worker_id=best_worker['worker_id'],  # Force routing to optimal worker
+        stop_conditions={"max_tokens": 20}
+    )
+
+    # Process response
+    async for response in stream:
+        if isinstance(response, dict) and "token_ids" in response:
+            print(f"Generated tokens: {response['token_ids']}")
+
+if __name__ == "__main__":
+    asyncio.run(minimize_ttft_routing())
+```
+
+This approach gives you complete control over routing decisions, allowing you to optimize for different metrics based on your specific requirements. For example:
+
+- **Minimize TTFT**: Select worker with lowest `potential_prefill_tokens`
+- **Maximize cache reuse**: Use `best_worker_id()`, which considers both prefill and decode loads
+- **Balance load**: Consider both `potential_prefill_tokens` and `potential_decode_blocks` together
+
+See [KV Router Architecture](../components/router/README.md) for performance tuning details.
@@ -909,7 +909,7 @@ impl KvPushRouter {
     }
 
     #[allow(clippy::too_many_arguments)]
-    #[pyo3(signature = (token_ids, model, stop_conditions=None, sampling_options=None, output_options=None, router_config_override=None))]
+    #[pyo3(signature = (token_ids, model, stop_conditions=None, sampling_options=None, output_options=None, router_config_override=None, worker_id=None))]
     fn generate<'p>(
         &self,
         py: Python<'p>,
@@ -919,6 +919,7 @@ impl KvPushRouter {
         sampling_options: Option<PyObject>,
         output_options: Option<PyObject>,
         router_config_override: Option<PyObject>,
+        worker_id: Option<i64>,
     ) -> PyResult<Bound<'p, PyAny>> {
         // Depythonize the options with defaults
         let (stop_conditions, sampling_options, output_options, router_config_override) =
@@ -957,15 +958,22 @@ impl KvPushRouter {
             })?;
 
         // Build the PreprocessedRequest
-        let request = llm_rs::protocols::common::preprocessor::PreprocessedRequest::builder()
+        let mut request_builder =
+            llm_rs::protocols::common::preprocessor::PreprocessedRequest::builder();
+        request_builder
             .model(model)
             .token_ids(token_ids)
             .stop_conditions(stop_conditions)
             .sampling_options(sampling_options)
             .output_options(output_options)
-            .router_config_override(router_config_override)
-            .build()
-            .map_err(to_pyerr)?;
+            .router_config_override(router_config_override);
+
+        // Set backend_instance_id if worker_id is provided
+        if let Some(worker_id) = worker_id {
+            request_builder.backend_instance_id(Some(worker_id));
+        }
+
+        let request = request_builder.build().map_err(to_pyerr)?;
 
         let inner = self.inner.clone();
 
@@ -1010,6 +1018,59 @@ impl KvPushRouter {
         })
     }
 
+    /// Find the best matching worker for `token_ids` without routing a request.
+    ///
+    /// `router_config_override`, when provided, is deserialized into a
+    /// `RouterConfigOverride` and forwarded to the router's `find_best_match`.
+    /// The returned awaitable resolves to a `(worker_id, overlap_blocks)` tuple.
+    ///
+    /// # Errors
+    /// Returns a Python error if the override cannot be deserialized or if
+    /// `find_best_match` fails.
+    #[pyo3(signature = (context_id, token_ids, router_config_override=None))]
+    fn best_worker_id<'p>(
+        &self,
+        py: Python<'p>,
+        context_id: String,
+        token_ids: Vec<u32>,
+        router_config_override: Option<PyObject>,
+    ) -> PyResult<Bound<'p, PyAny>> {
+        // The GIL token `py` is already available here, so convert the override
+        // directly instead of re-entering `Python::with_gil`.
+        let router_config_override: Option<llm_rs::kv_router::RouterConfigOverride> =
+            router_config_override
+                .map(|obj| depythonize(obj.bind(py)).map_err(to_pyerr))
+                .transpose()?;
+
+        let inner = self.inner.clone();
+
+        pyo3_async_runtimes::tokio::future_into_py(py, async move {
+            let (worker_id, overlap_blocks) = inner
+                .find_best_match(&context_id, &token_ids, router_config_override.as_ref())
+                .await
+                .map_err(to_pyerr)?;
+
+            // Return a tuple of (worker_id, overlap_blocks)
+            Ok((worker_id, overlap_blocks))
+        })
+    }
+
+    /// Get the potential prefill and decode loads for all workers for `token_ids`.
+    ///
+    /// The returned awaitable resolves to the router's load entries converted
+    /// to Python objects with `pythonize`.
+    fn get_potential_loads<'p>(
+        &self,
+        py: Python<'p>,
+        token_ids: Vec<u32>,
+    ) -> PyResult<Bound<'p, PyAny>> {
+        let router = self.inner.clone();
+
+        pyo3_async_runtimes::tokio::future_into_py(py, async move {
+            let worker_loads = router
+                .get_potential_loads(&token_ids)
+                .await
+                .map_err(to_pyerr)?;
+
+            // Acquire the GIL for the conversion of the loads into Python objects.
+            Python::with_gil(|py| {
+                let converted = pythonize(py, &worker_loads).map_err(to_pyerr)?;
+                Ok::<_, PyErr>(converted.unbind())
+            })
+        })
+    }
+
     /// Dump all events from the KV router's indexer as a JSON string
     fn dump_events<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyAny>> {
         let inner = self.inner.clone();
 
@@ -1227,6 +1227,7 @@ class KvPushRouter:
         sampling_options: Optional[JsonLike] = None,
         output_options: Optional[JsonLike] = None,
         router_config_override: Optional[JsonLike] = None,
+        worker_id: Optional[int] = None,
     ) -> AsyncIterator[JsonLike]:
         """
         Generate text using the KV-aware router.
@@ -1238,9 +1239,56 @@ class KvPushRouter:
             sampling_options: Optional sampling configuration
             output_options: Optional output configuration
             router_config_override: Optional router configuration override
+            worker_id: Optional worker ID to route to directly. If set, the request
+                      will be sent to this specific worker and router states will be
+                      updated accordingly.
 
         Returns:
             An async iterator yielding generation responses
+
+        Note:
+            - If worker_id is set, the request bypasses KV matching and routes directly
+              to the specified worker while still updating router states.
+            - This is different from query_instance_id which doesn't route the request.
+        """
+        ...
+
+    async def best_worker_id(
+        self,
+        context_id: str,
+        token_ids: List[int],
+        router_config_override: Optional[JsonLike] = None,
+    ) -> Tuple[int, int]:
+        """
+        Find the best matching worker for the given tokens without updating states.
+
+        This only queries the router; the request itself is not routed to any worker.
+
+        Args:
+            context_id: String identifier for the request
+            token_ids: List of token IDs to find matches for
+            router_config_override: Optional router configuration override,
+                applied to this query only
+
+        Returns:
+            A tuple of (worker_id, overlap_blocks) where:
+                - worker_id: The ID of the best matching worker
+                - overlap_blocks: The number of overlapping blocks found
+        """
+        ...
+
+    async def get_potential_loads(
+        self,
+        token_ids: List[int],
+    ) -> List[Dict[str, int]]:
+        """
+        Get potential prefill and decode loads for all workers.
+
+        The result can be used to pick a worker manually, e.g. the entry with the
+        lowest ``potential_prefill_tokens``.
+
+        Args:
+            token_ids: List of token IDs to evaluate
+
+        Returns:
+            A list of dictionaries, each containing:
+                - worker_id: The worker ID
+                - potential_prefill_tokens: Number of tokens that would need prefill
+                - potential_decode_blocks: Number of blocks currently in decode phase
         """
         ...