
Commit ed61cc7

[Fix] Properly handle cached mm_inputs on Encode instance
Signed-off-by: LastZhabka <sakhmoldin.mukhammadarif@gmail.com>
1 parent d8c7cc6 commit ed61cc7

3 files changed: +43 -7 lines changed

vllm/separated_encode/README.md

Lines changed: 6 additions & 2 deletions
@@ -220,7 +220,9 @@ Separate EncoderScheduler class implementation is provided for encode instance s
 
 The EncoderScheduler is a specialized scheduler for encode instances that focuses on only multimodal input scheduling. It maintains an `_allocated` dictionary to track allocated encoder cache entries, their sizes and hashes. This dictionary is used to allow us to free up logical space without storing the request itself, which enables us to end the request before the data is transferred.
 
-Currently the encode scheduler schedules all multimodal inputs for a request at once in the `schedule()` method. It checks if there's sufficient encoder cache space and budget before allocating all inputs together. A request on the encode instance is considered finished when all its multimodal embeddings have been computed, so all requests are finished in 1 iteration after scheduling, transfer is handled separately in encoder cache connectors, space allocated for encoder cache is deallocated only after transfers, not after request finish.
+Currently the encode scheduler schedules all multimodal inputs for a request at once in the `schedule()` method. It checks that there is sufficient encoder cache space and budget before allocating all inputs together. Note that if an input is already cached, we still add it to `scheduled_encoder_inputs`, but we do not allocate space for it, and the model runner skips encoder execution for such elements; this is necessary because in the `model_runner` a signal must be sent to the `ECConnector` for each `mm_input`.
+
+A request on the encode instance is considered finished when all its multimodal embeddings have been computed, so every request finishes one iteration after it is scheduled. The transfer itself is handled separately by the encoder cache connectors, and the space allocated in the encoder cache is deallocated only after the transfer completes, not when the request finishes.
 
 In the `update_from_output()` method, the scheduler goes through transferred multimodal data IDs and frees the mm inputs in encoder cache manager.
 

@@ -250,7 +252,9 @@ This wrapper runs on encode instances and processes multimodal inputs. It execut
 
 The encode instance doesn't need KV cache since it only runs vision part of MLLM. The wrapper overrides `initialize_kv_cache_tensors` and `initialize_kv_cache` to return empty results, freeing up GPU memory for larger encoder cache storage.
 
-During execution, the wrapper executes encoding for scheduled multimodal inputs and inserts enocder output in encoder cache connector. Since no text generation happens here, it returns empty ModelRunnerOutput with additional transfered encoder outputs information in ModelRunnerOutput, this information is used in encoder scheduler to free the space in encoder cache manager.
+During execution, the wrapper runs encoding for the scheduled multimodal inputs and inserts the encoder output into the encoder cache connector. Due to the nature of the encode scheduler, `scheduler_output.scheduled_encoder_inputs` can contain already cached inputs or multiple copies of the same multimodal input; since the cache is already present (or about to be produced by another scheduled copy), we can simply skip encoding for such `mm_inputs`. We therefore temporarily remove cached inputs, as well as inputs whose `mm_hash` already appears elsewhere in `scheduled_encoder_inputs`, and after execution we put all removed entries back into `scheduler_output`. The motivation for sending all multimodal inputs to the `model_runner` is given in the `EncoderScheduler` section.
+
+Since no text generation happens here, it returns an almost empty ModelRunnerOutput that additionally carries information about the transferred encoder outputs; the encoder scheduler uses this information to free space in the encoder cache manager.
 
 #### DisaggPrefillDecodeGPURunnerWrapper (Prefill/(Prefill+Decode) Instance)
 

vllm/separated_encode/sched/encoder_scheduler.py

Lines changed: 14 additions & 4 deletions
@@ -135,7 +135,9 @@ def schedule(self) -> SchedulerOutput:
 
             num_tokens_to_schedule = 0
             can_allocate_all = True
-            encoder_inputs_to_schedule = []
+            encoder_inputs_to_schedule = []
+            is_cached = []
+
             for input_id, pos_info in enumerate(mm_positions):
                 num_encoder_tokens = pos_info.length
                 if (
@@ -144,6 +146,13 @@ def schedule(self) -> SchedulerOutput:
                         request, input_id
                     )
                 ):
+                    # On Encoder instance we need to send all inputs to model runner
+                    # because we need to pass (req_id, input_id) to model runner's
+                    # ec connector, to send the cache to PD instance, so we will add
+                    # it to the scheduled encoder inputs without changing budget
+                    # and in model runner we will just skip all calculated values
+                    encoder_inputs_to_schedule.append(input_id)
+                    is_cached.append(True)
                     continue
                 if not self.encoder_cache_manager.can_allocate(
                     request=request,
@@ -156,6 +165,7 @@ def schedule(self) -> SchedulerOutput:
                 num_tokens_to_schedule += num_encoder_tokens
                 new_encoder_compute_budget -= num_encoder_tokens
                 encoder_inputs_to_schedule.append(input_id)
+                is_cached.append(False)
 
             # NOTE: Note that all updates from loop above are not applied
             # if we can't allocate all mm_inputs
@@ -179,10 +189,11 @@ def schedule(self) -> SchedulerOutput:
             scheduled_encoder_inputs[req_id] = encoder_inputs_to_schedule
 
             # Allocate the encoder cache.
-            for input_id in encoder_inputs_to_schedule:
+            for input_id, is_cached_input in zip(encoder_inputs_to_schedule, is_cached):
                 mm_hash = request.mm_hashes[input_id]
                 num_encoder_tokens = request.get_num_encoder_tokens(input_id)
-                self.encoder_cache_manager.allocate(request, input_id)
+                if not is_cached_input:
+                    self.encoder_cache_manager.allocate(request, input_id)
                 self.ec_connector.schedule_send_encoder_cache_metadata(
                     req_id,
                     input_id,
@@ -216,7 +227,6 @@ def schedule(self) -> SchedulerOutput:
             structured_output_request_ids={},
             grammar_bitmask=None,
         )
-        logger.debug(f"Request (8) ")
 
         self.finished_req_ids = set()
         return scheduler_output
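The invariant this change preserves can be shown with a small standalone sketch: every scheduled (req_id, input_id) pair still gets a cache-metadata send, while only uncached inputs take encoder cache space. `FakeCacheManager` and `FakeConnector` below are hypothetical stand-ins for illustration only, not the real vLLM `EncoderCacheManager` or `ECConnector` classes.

# Hypothetical stand-ins used only to illustrate the allocate-vs-send split
# introduced by the parallel is_cached flags.
class FakeCacheManager:
    def __init__(self):
        self.allocated = []
    def allocate(self, req_id, input_id):
        self.allocated.append((req_id, input_id))

class FakeConnector:
    def __init__(self):
        self.sends = []
    def schedule_send(self, req_id, input_id):
        self.sends.append((req_id, input_id))

manager, connector = FakeCacheManager(), FakeConnector()
scheduled, is_cached = [0, 1, 2], [False, True, False]
for input_id, cached in zip(scheduled, is_cached):
    if not cached:
        manager.allocate("req-0", input_id)     # only uncached inputs take space
    connector.schedule_send("req-0", input_id)  # but every input is announced

assert manager.allocated == [("req-0", 0), ("req-0", 2)]
assert connector.sends == [("req-0", 0), ("req-0", 1), ("req-0", 2)]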

vllm/separated_encode/worker/gpu_epd_vm_wrapper.py

Lines changed: 23 additions & 1 deletion
@@ -112,8 +112,30 @@ def execute_model(
         scheduler.
         """
         self._update_states(scheduler_output)
+        old_scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
+        new_scheduled_encoder_inputs = {}
+
+        # Erase cached inputs to execute mm encoder without repeated cache inputs
+        going_to_be_executed = set()
+        for req_id, mm_input_ids in old_scheduled_encoder_inputs.items():
+            mm_hashes = self.requests[req_id].mm_hashes
+            uncached_inputs = []
+            for input_id in mm_input_ids:
+                mm_hash = mm_hashes[input_id]
+                if ((not mm_hash in self.encoder_cache)
+                        and (mm_hash not in going_to_be_executed)):
+                    uncached_inputs.append(input_id)
+                    going_to_be_executed.add(mm_hash)
+            new_scheduled_encoder_inputs[req_id] = uncached_inputs
+
+        scheduler_output.scheduled_encoder_inputs = new_scheduled_encoder_inputs
+
         self._execute_mm_encoder(scheduler_output)
-        scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
+
+        scheduler_output.scheduled_encoder_inputs = old_scheduled_encoder_inputs
+        del new_scheduled_encoder_inputs
+
+        scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
 
         for req_id, mm_input_ids in scheduled_encoder_inputs.items():
             mm_hashes = self.requests[req_id].mm_hashes
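The de-duplication pass above can be exercised in isolation. The toy example below uses plain dicts as stand-ins for the runner's `self.requests` and `self.encoder_cache` (the data values are invented for illustration); it shows how inputs whose hash is already cached, or already picked for execution earlier in the same batch, are filtered out while the full schedule is kept for the later per-input loop.

# Toy reproduction of the de-duplication pass: keep only inputs that are
# neither already in the encoder cache nor already selected for execution
# in this batch (same mm_hash appearing under another request/input id).
encoder_cache = {"hash_a"}                      # already computed earlier
request_hashes = {
    "req-0": ["hash_a", "hash_b"],
    "req-1": ["hash_b", "hash_c"],
}
scheduled = {"req-0": [0, 1], "req-1": [0, 1]}  # everything the scheduler sent

going_to_be_executed = set()
filtered = {}
for req_id, input_ids in scheduled.items():
    uncached = []
    for input_id in input_ids:
        mm_hash = request_hashes[req_id][input_id]
        if mm_hash not in encoder_cache and mm_hash not in going_to_be_executed:
            uncached.append(input_id)
            going_to_be_executed.add(mm_hash)
    filtered[req_id] = uncached

# hash_a is already cached, and hash_b is encoded only once (for req-0).
assert filtered == {"req-0": [1], "req-1": [1]}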
