Commit 2832667

Refactor the token-wise padding mechanism into a more elegant implementation, correcting the padding-logic errors introduced by the previous multimodal commit.
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
1 parent: 223afe9 · commit: 2832667

File tree: 1 file changed (+4, -8 lines)

vllm_ascend/worker/model_runner_v1.py

Lines changed: 4 additions & 8 deletions
@@ -945,10 +945,7 @@ def _process_reqs(
         # Copy the tensors to the NPU.
         self.input_ids[:total_num_scheduled_tokens].copy_(
             self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
-        input_ids = self.input_ids[:num_input_tokens]
 
-        # prepare the MRoPE for mllm if using multimodal
-        num_input_tokens = total_num_scheduled_tokens
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
         if self.is_multimodal_model:
@@ -962,27 +959,26 @@ def _process_reqs(
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
-            input_ids = self.input_ids[:num_input_tokens]
+            input_ids = self.input_ids[:total_num_scheduled_tokens]
             if mm_embeds:
                 inputs_embeds = self.model.get_input_embeddings(
                     input_ids, mm_embeds)
             else:
                 inputs_embeds = self.model.get_input_embeddings(input_ids)
             # TODO(woosuk): Avoid the copy. Optimize.
-            self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds)
+            self.inputs_embeds[:total_num_scheduled_tokens].copy_(
+                inputs_embeds)
             inputs_embeds = self.inputs_embeds[:num_input_tokens]
             input_ids = None
         else:
             # For text-only models, we use token ids as input.
             # While it is possible to use embeddings as input just like the
             # multimodal models, it is not desirable for performance since
-            # then the embedding layer is not included in the CUDA graph.
+            # then the embedding layer is not included in the ACL Graph.
             input_ids = self.input_ids[:num_input_tokens]
             inputs_embeds = None
         if self.uses_mrope:
             positions = self.mrope_positions[:, :num_input_tokens]
-        else:
-            positions = self.positions[:num_input_tokens]
 
         if (envs_ascend.VLLM_ENABLE_MC2
                 or self.torchair_graph_enabled) and not with_prefill:
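
For context, the sketch below illustrates the token-wise padding pattern the diff restores: the persistent input_ids / inputs_embeds buffers are written only for the tokens actually scheduled this step (total_num_scheduled_tokens), while the model consumes a slice padded up to the graph-capture length (num_input_tokens). This is a minimal, self-contained illustration under assumed names and sizes, not the vllm_ascend code itself; pad_to_graph_size, MAX_TOKENS, and GRAPH_SIZES are hypothetical stand-ins for however the padded length is actually chosen.

import torch

MAX_TOKENS = 4096                    # assumed capacity of the persistent buffers
GRAPH_SIZES = [8, 16, 32, 64, 128]   # assumed graph-capture sizes for illustration

input_ids = torch.zeros(MAX_TOKENS, dtype=torch.int64)   # persistent token buffer
inputs_embeds = torch.zeros(MAX_TOKENS, 1024)            # persistent embedding buffer


def pad_to_graph_size(n: int) -> int:
    # Round the real token count up to the next captured graph size;
    # fall back to the unpadded count (eager execution) if none is large enough.
    for size in GRAPH_SIZES:
        if n <= size:
            return size
    return n


def prepare_inputs(token_ids: torch.Tensor, embed_fn) -> torch.Tensor:
    total_num_scheduled_tokens = token_ids.numel()                    # tokens actually scheduled
    num_input_tokens = pad_to_graph_size(total_num_scheduled_tokens)  # padded length

    # Write only the scheduled tokens into the persistent buffers; the padded
    # tail is left untouched rather than being filled with this step's data.
    input_ids[:total_num_scheduled_tokens] = token_ids
    inputs_embeds[:total_num_scheduled_tokens] = embed_fn(
        input_ids[:total_num_scheduled_tokens])

    # The model always sees the padded slice so its shape matches a captured graph.
    return inputs_embeds[:num_input_tokens]


embed = torch.nn.Embedding(32000, 1024)
out = prepare_inputs(torch.randint(0, 32000, (13,)), embed)
assert out.shape == (16, 1024)   # 13 scheduled tokens padded up to the 16-token graph

The key point, mirrored in the diff, is that the copy into the embedding buffer is sliced by the real token count, while the slice handed to the model keeps the padded length expected by the captured graph.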
