Commit 0aad185

some small fixes
1 parent d192dfc commit 0aad185

6 files changed: +23 -39 lines changed


lib/bindings/python/rust/llm/block_manager/distributed/leader.rs

Lines changed: 2 additions & 3 deletions

@@ -67,8 +67,8 @@ impl KvbmLeader {
 #[pymethods]
 impl KvbmLeader {
     #[new]
-    #[pyo3(signature = (bytes_per_block, world_size, drt))]
-    fn new(bytes_per_block: usize, world_size: usize, drt: DistributedRuntime) -> PyResult<Self> {
+    #[pyo3(signature = (world_size, drt))]
+    fn new(world_size: usize, drt: DistributedRuntime) -> PyResult<Self> {

         let barrier_id_prefix = get_barrier_id_prefix();
         let leader_init_timeout_sec: u64 =
@@ -81,7 +81,6 @@ impl KvbmLeader {
             .drt(drt.inner().clone())
             .host_blocks_config(get_blocks_config(CPU_CACHE, CPU_CACHE_OVERRIDE))
             .disk_blocks_config(get_blocks_config(DISK_CACHE, DISK_CACHE_OVERRIDE))
-            .bytes_per_block_overriden(bytes_per_block)
             .build()
             .map_err(to_pyerr)?;

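With this change, the Python-facing constructor drops the bytes_per_block argument: the leader now learns the per-block byte count from the workers (see the lib/llm/src/block_manager/distributed/leader.rs change at the bottom of this commit). A minimal sketch of the new call, modeled on the connector code below; the import path is an assumption, not part of this diff:

from dynamo.llm import DistributedRuntime, KvbmLeader  # assumed import path

drt = DistributedRuntime.detached()
world_size = 2  # e.g. executor_config.mapping.world_size in the TRT-LLM integration

# Before this commit the first argument was bytes_per_block (0 meant "resolve later"):
#   leader = KvbmLeader(0, world_size, drt=drt)
leader = KvbmLeader(world_size, drt=drt)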
lib/bindings/python/rust/llm/block_manager/vllm/connector/trtllm_leader.rs

Lines changed: 2 additions & 2 deletions

@@ -55,7 +55,7 @@ pub struct KvConnectorLeader {

 impl KvConnectorLeader {
     fn new(
-        worker_id: String,
+        worker_id: u64,
         drt: PyDistributedRuntime,
         block_manager: PyBlockManager,
         leader: PyKvbmLeader,
@@ -374,7 +374,7 @@ impl PyTrtllmKvConnectorLeader {
     #[new]
     #[pyo3(signature = (worker_id, drt, block_manager, leader))]
     pub fn new(
-        worker_id: String,
+        worker_id: u64,
         drt: PyDistributedRuntime,
         block_manager: PyBlockManager,
         leader: PyKvbmLeader,
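Both the internal KvConnectorLeader and the pyo3-exposed PyTrtllmKvConnectorLeader now take worker_id as a u64 rather than a String, so the Python caller can pass the integer rank directly. A hedged sketch of the call-site implication (the Python-side class name and surrounding setup are assumptions; the call site itself is not part of this diff):

# worker_id is now an int (u64 on the Rust side); previously it had to be a string
rank = 0  # e.g. executor_config.mapping.rank
connector_leader = KvConnectorLeader(rank, drt, block_manager, leader)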

lib/bindings/python/rust/llm/block_manager/vllm/connector/trtllm_worker.rs

Lines changed: 1 addition & 1 deletion

@@ -403,7 +403,7 @@ impl PyTrtllmKvConnectorWorker {
             .map_err(to_pyerr)
     }

-    pub fn build_connector_meta(&mut self, metadata: Vec<u8>) -> PyResult<()> {
+    pub fn bind_connector_meta(&mut self, metadata: Vec<u8>) -> PyResult<()> {
         self.connector_worker
             .bind_connector_meta(metadata)
             .map_err(to_pyerr)
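The worker-side binding method is renamed from build_connector_meta to bind_connector_meta, matching the connector_worker call it forwards to and distinguishing it from the leader-side build_connector_meta, which produces the serialized bytes. A hedged usage sketch; the variable names are illustrative:

# Leader builds the serialized metadata, worker binds it:
metadata: bytes = leader_connector.build_connector_meta(scheduler_output)
worker_connector.bind_connector_meta(metadata)  # was: worker_connector.build_connector_meta(metadata)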

lib/bindings/python/src/dynamo/llm/trtllm_integration/connector/kvbm_connector_leader.py

Lines changed: 11 additions & 11 deletions

@@ -25,11 +25,11 @@ def __init__(self, executor_config: ExecutorConfig):
         super().__init__(executor_config)
         self.drt = DistributedRuntime.detached()

-        world_size = self._config.world_size
+        world_size = self._config.mapping.world_size
         self.block_size = self._config.tokens_per_block

         # Set bytes_per_block to 0, because we will retrieve the actual value from the worker side.
-        leader = KvbmLeader(0, world_size, drt=self.drt)
+        leader = KvbmLeader(world_size, drt=self.drt)

         block_manager = BlockManager(
             0,
@@ -58,7 +58,7 @@ def build_connector_meta(self, scheduler_output: SchedulerOutput) -> bytes:

         for req in scheduler_output.new_requests:
             output.add_new_request(
-                req.request_id,
+                str(req.request_id),
                 req.new_tokens,
                 req.new_block_ids,
                 req.computed_position,
@@ -67,7 +67,7 @@ def build_connector_meta(self, scheduler_output: SchedulerOutput) -> bytes:
         resumed_from_preemption = False
         for req in scheduler_output.cached_requests:
             output.add_cached_request(
-                req.request_id,
+                str(req.request_id),
                 resumed_from_preemption,
                 req.new_tokens,
                 req.new_block_ids,
@@ -91,7 +91,7 @@ def get_num_new_matched_tokens(
         """
         self._create_slot(request)
         return self._connector.get_num_new_matched_tokens(
-            request.request_id,
+            str(request.request_id),
             len(request.get_tokens(0)),
             num_computed_tokens,
         )
@@ -103,7 +103,7 @@ def update_state_after_alloc(self, request: LlmRequest, block_ids: List[int]):
             request: The request that was allocated resources.
             block_ids: The KV cacheblock IDs that were allocated.
         """
-        self._connector.update_state_after_alloc(request.request_id, block_ids)
+        self._connector.update_state_after_alloc(str(request.request_id), block_ids)

     def request_finished(self, request: LlmRequest, cache_block_ids: list[int]) -> bool:
         """
@@ -115,14 +115,14 @@ def request_finished(self, request: LlmRequest, cache_block_ids: list[int]) -> b
         If true, this indicates that the kv cache manager should wait to deallocate the blocks until the saving has completed (determined by `get_finished` on the workers).
         """
         is_async_saving = self._connector.request_finished(
-            request.request_id, cache_block_ids
+            str(request.request_id), cache_block_ids
         )
         return is_async_saving

     def _create_slot(self, request: LlmRequest) -> None:
         """Create a slot for the request"""

-        if self._connector.has_slot(request.request_id):
+        if self._connector.has_slot(str(request.request_id)):
             return None

         if bool(request.multimodal_positions):
@@ -131,8 +131,8 @@ def _create_slot(self, request: LlmRequest) -> None:
         all_token_ids = request.get_tokens(0)

         # extract the critial aspects of the request that effect how the tokens are hashed
-        request = KvbmRequest(
-            request_id=request.request_id, lora_name=None, salt_hash=None
+        kvbm_request = KvbmRequest(
+            request_id=str(request.request_id), lora_name=None, salt_hash=None
         )

-        self._connector.create_slot(request, all_token_ids)
+        self._connector.create_slot(kvbm_request, all_token_ids)
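TRT-LLM hands the connector integer request ids, while the Rust connector and KvbmRequest key slots by string ids, so every crossing into the connector now wraps the id in str(...); renaming the local KvbmRequest to kvbm_request also stops it from shadowing the incoming LlmRequest. A tiny sketch of the conversion pattern (hypothetical helper, not part of this commit):

def _request_id(request) -> str:
    # TRT-LLM request ids are ints; the Rust connector expects string ids
    return str(request.request_id)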

lib/bindings/python/src/dynamo/llm/trtllm_integration/connector/kvbm_connector_worker.py

Lines changed: 5 additions & 7 deletions

@@ -13,16 +13,14 @@


 class DynamoKVBMConnectorWorker(KvCacheConnectorWorker):
-    def __init__(self, executor_config: ExecutorConfig, **kwargs):
+    def __init__(self, executor_config: ExecutorConfig):
         super().__init__(executor_config)

-        drt = kwargs.get("drt", None)
-        if drt is None:
-            self.drt = DistributedRuntime.detached()
-        else:
-            self.drt = drt
+        self.drt = DistributedRuntime.detached()

-        self._connector = RustKvConnectorWorker(self.drt, executor_config.mapping.rank)
+        self._connector = RustKvConnectorWorker(
+            self.drt, str(executor_config.mapping.rank)
+        )

     def register_kv_caches(self, kv_cache_tensor: torch.Tensor):
         """

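DynamoKVBMConnectorWorker no longer accepts a drt keyword argument; it always creates a detached runtime internally and hands the TRT-LLM rank to the Rust worker as a string. A minimal sketch of the resulting construction:

# drt can no longer be injected via kwargs; the worker owns a detached runtime
worker = DynamoKVBMConnectorWorker(executor_config)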
lib/llm/src/block_manager/distributed/leader.rs

Lines changed: 2 additions & 15 deletions

@@ -67,9 +67,6 @@ pub struct KvbmLeaderConfig {

     #[builder(default = "KvbmLeaderNumBlocksConfig::default()")]
     disk_blocks_config: KvbmLeaderNumBlocksConfig,
-
-    #[builder(default = "0")]
-    bytes_per_block_overriden: usize,
 }

 impl KvbmLeaderConfig {
@@ -137,23 +134,13 @@ impl KvbmLeader {
             .min()
             .unwrap();

-        let mut bytes_per_block = worker_data
+        let bytes_per_block = worker_data
             .values()
             .map(|data| data.bytes_per_block)
-            .max()
-            .unwrap();
+            .sum();

         assert!(bytes_per_block > 0, "bytes_per_block must be greater than 0");

-        // The NumBlocksConfig represents the overall assigned resources by the user,
-        // so we need to devide it by the world size to distribute the resources across all TPs.
-        bytes_per_block *= config.world_size;
-
-        // If bytes_per_block_overriden is greater than 0, it means the user has overridden this value.
-        if config.bytes_per_block_overriden > 0 {
-            bytes_per_block = config.bytes_per_block_overriden
-        }
-
         tracing::info!("Worker to leader barrier synced with {} workers", config.world_size);
         tracing::debug!("Worker data: {:?}", worker_data);

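The leader now computes bytes_per_block as the sum of the values reported by each worker through the barrier, replacing the old max-times-world_size formula and the bytes_per_block_overriden escape hatch. The two aggregations agree when every worker reports the same value, but summing stays correct when they differ. A Python sketch of the arithmetic (worker_data here is an illustrative dict, not the Rust type):

worker_data = {0: 4096, 1: 4096}  # worker rank -> bytes_per_block reported by that worker
world_size = len(worker_data)

old_value = max(worker_data.values()) * world_size  # previous aggregation: 8192
new_value = sum(worker_data.values())               # new aggregation: 8192
assert new_value > 0, "bytes_per_block must be greater than 0"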