@@ -36,13 +36,33 @@ def __init__(self,
         )

     def _get_request_ranks(self, request_id: str):
-        # request_id format: $ACTUAL_REQUEST_ID|$E_RANK|$PD_RANK
+        """Extract E_RANK and PD_RANK from a proxy-formatted request ID.
+
+        Expects request_id in the format $ACTUAL_REQUEST_ID|$E_RANK|$PD_RANK.
+
+        Args:
+            request_id: The formatted request ID string from the proxy.
+
+        Returns:
+            Tuple containing (E_RANK, PD_RANK).
+        """
         result = request_id.split("|")
-        return int(result[1]), int(result[2])
+        return int(result[-2]), int(result[-1])

     def _send_prealloc_notification(self, request_id: str, input_id: int,
                                     successful: bool, mm_hash: str) -> None:
-        # PD -> E
+        """
+        Send pre-allocation notification from PD to E instance via Redis.
+
+        Notifies the encoder instance whether pre-allocation was successful
+        and whether the encoder cache should be sent.
+
+        Args:
+            request_id: The formatted request ID containing rank information.
+            input_id: Index of the multimodal input within the request.
+            successful: Whether pre-allocation succeeded and the cache should be sent.
+            mm_hash: Hash of the multimodal input.
+        """
         transfer_data = {
             "request_id": request_id,
             "input_id": input_id,
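
For reference, a minimal sketch of the request ID convention this connector relies on: the proxy appends the encoder rank and prefill/decode rank to the original request ID, and the ranks are recovered from the last two "|"-separated fields, which is why the change above switches to negative indices. The format_request_id helper below is hypothetical, not part of this change.

def format_request_id(actual_request_id: str, e_rank: int, pd_rank: int) -> str:
    # $ACTUAL_REQUEST_ID|$E_RANK|$PD_RANK
    return f"{actual_request_id}|{e_rank}|{pd_rank}"

def get_request_ranks(request_id: str) -> tuple[int, int]:
    # Taking the last two fields keeps parsing correct even if the
    # original request ID itself contains "|" characters.
    fields = request_id.split("|")
    return int(fields[-2]), int(fields[-1])

assert get_request_ranks(format_request_id("req-abc|0", 3, 1)) == (3, 1)
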
@@ -58,7 +78,18 @@ def _send_encoder_cache_metas(
         self, request_id: str, input_id: int,
         num_encoder_tokens: int, mm_hash: str
     ) -> None:
-        # E -> PD
+        """
+        Send encoder cache metadata from E to PD instance via Redis.
+
+        Transfers metadata needed for pre-allocating space for the encoder cache
+        on the prefill/decode instance.
+
+        Args:
+            request_id: The formatted request ID containing rank information.
+            input_id: Index of the multimodal input within the request.
+            num_encoder_tokens: Number of tokens in the encoder cache.
+            mm_hash: Hash of the multimodal input.
+        """
         transfer_data = {
             "request_id": request_id,
             "input_id": input_id,
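
The hunk above is truncated before the actual Redis call, so the following is only a hedged sketch of what the send side presumably does: serialize the metadata dict with msgpack_numpy and push it onto a per-rank Redis list matching the blpop() keys used by the receivers further down. The rpush call, key naming, and standalone function are assumptions, not code from this diff.

import msgpack_numpy
import redis

def send_encoder_cache_metas(client: redis.Redis, pd_rank: int, request_id: str,
                             input_id: int, num_encoder_tokens: int,
                             mm_hash: str) -> None:
    # Same fields as the transfer_data dict built above.
    transfer_data = {
        "request_id": request_id,
        "input_id": input_id,
        "num_encoder_tokens": num_encoder_tokens,
        "mm_hash": mm_hash,
    }
    # Assumed: the receiver blocks on blpop(f"cache_metas{rank}"), so the
    # sender pushes onto the list keyed by the destination PD rank.
    client.rpush(f"cache_metas{pd_rank}", msgpack_numpy.packb(transfer_data))
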
@@ -73,7 +104,18 @@ def _send_encoder_cache_metas(
     def _send_encoder_cache(
             self, request_id: str, input_id: int,
             encoder_cache: torch.Tensor, mm_hash: str) -> None:
-        # E -> PD
+        """
+        Send encoder cache tensor from E to PD instance via Redis.
+
+        Converts the encoder cache to a CPU float16 numpy array before sending
+        to reduce transfer size.
+
+        Args:
+            request_id: The formatted request ID containing rank information.
+            input_id: Index of the multimodal input within the request.
+            encoder_cache: The encoder output tensor to transfer.
+            mm_hash: Hash of the multimodal input.
+        """
         encoder_cache_numpy = encoder_cache.to("cpu", dtype=torch.float16).numpy()
         transfer_data = msgpack_numpy.packb({
             "request_id": request_id,
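
A minimal sketch of the dtype round-trip described in the docstring above: the encoder output is moved to CPU float16 and turned into a numpy array for transfer, and the receiving side can restore a torch tensor from it. The restore path, target device, and dtype on the receive side are assumptions.

import numpy as np
import torch

def pack_encoder_cache(encoder_cache: torch.Tensor) -> np.ndarray:
    # Matches the conversion above: CPU + float16 before serialization.
    return encoder_cache.to("cpu", dtype=torch.float16).numpy()

def unpack_encoder_cache(cache_array: np.ndarray,
                         device: str = "cpu") -> torch.Tensor:
    # Assumed restore path on the PD side; device/dtype choices may differ.
    return torch.from_numpy(cache_array).to(device)

cache = torch.randn(4, 8)            # stand-in for an encoder output
restored = unpack_encoder_cache(pack_encoder_cache(cache))
assert restored.shape == cache.shape and restored.dtype == torch.float16
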
@@ -88,6 +130,16 @@ def _send_encoder_cache(
     def _recv_prealloc_notification(
             self, maybe_send_cache_callback: Callable[[str, int, bool, str],
                                                       None]) -> None:
+        """
+        Receive pre-allocation notification on E instance from Redis.
+
+        Blocks until a notification is received, then unpacks the data and
+        invokes the callback to handle cache sending logic.
+
+        Args:
+            maybe_send_cache_callback: Callback to determine whether to send
+                the encoder cache based on the pre-allocation result.
+        """
         transfered_data = self.redis_client.blpop(f"prealloc{self.rank}")[1]
         transfered_data = msgpack_numpy.unpackb(transfered_data, raw=False)
         request_id, input_id, successful, mm_hash = (
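
The receive path above follows a single pattern: block on a per-rank Redis list, unpack the msgpack payload, and hand the fields to the caller-supplied callback. A hedged standalone sketch of that pattern (the function below is illustrative, not the class method itself):

from typing import Callable

import msgpack_numpy
import redis

def recv_prealloc_notification(
        client: redis.Redis, rank: int,
        maybe_send_cache_callback: Callable[[str, int, bool, str], None]) -> None:
    # blpop blocks until an item is available and returns (key, value).
    _, payload = client.blpop(f"prealloc{rank}")
    data = msgpack_numpy.unpackb(payload, raw=False)
    maybe_send_cache_callback(data["request_id"], data["input_id"],
                              data["successful"], data["mm_hash"])
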
@@ -102,6 +154,16 @@ def _recv_prealloc_notification(
     def _recv_encoder_cache_metas(
             self, preallocate_callback: Callable[[str, int, int, str],
                                                  None]) -> None:
+        """
+        Receive encoder cache metadata on PD instance from Redis.
+
+        Blocks until metadata is received, then unpacks the data and invokes
+        the callback to pre-allocate space in the scheduler.
+
+        Args:
+            preallocate_callback: Scheduler callback to pre-allocate space
+                for the incoming encoder cache.
+        """
         transfered_data = self.redis_client.blpop(f"cache_metas{self.rank}")[1]
         transfered_data = msgpack_numpy.unpackb(transfered_data, raw=False)
         request_id, input_id, num_encoder_tokens, mm_hash = (
@@ -117,6 +179,16 @@ def _recv_encoder_cache(
         self,
         injection_callback: Callable[[str, int, torch.Tensor, str], None]
     ) -> None:
+        """
+        Receive encoder cache tensor on PD instance from Redis.
+
+        Blocks until cache data is received, converts it from numpy back to
+        the appropriate torch tensor format, then invokes the injection callback.
+
+        Args:
+            injection_callback: Model runner callback to inject the encoder
+                cache into the cache dictionary.
+        """
         transfered_data = self.redis_client.blpop(f"cache{self.rank}")[1]
         transfered_data = msgpack_numpy.unpackb(transfered_data, raw=False)
         request_id, input_id, encoder_cache, mm_hash = (
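
Since all three _recv_* helpers block on Redis, one plausible way to drive them (an assumption, not shown in this diff) is a daemon thread per channel that loops forever and re-invokes the helper with its callback:

import threading
from typing import Callable

def start_receiver_loop(recv_fn: Callable[[Callable[..., None]], None],
                        callback: Callable[..., None]) -> threading.Thread:
    def loop() -> None:
        while True:
            recv_fn(callback)   # each call blocks on Redis until one item arrives
    thread = threading.Thread(target=loop, daemon=True)
    thread.start()
    return thread
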