Commit 2832667

Refactor the token-wise padding mechanism into a more elegant implementation, correcting the padding-logic errors introduced by the previous multimodal commit.
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
1 parent: 223afe9 · commit: 2832667

File tree: 1 file changed (+4, -8 lines)

vllm_ascend/worker/model_runner_v1.py

Lines changed: 4 additions & 8 deletions
@@ -945,10 +945,7 @@ def _process_reqs(
         # Copy the tensors to the NPU.
         self.input_ids[:total_num_scheduled_tokens].copy_(
             self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
-        input_ids = self.input_ids[:num_input_tokens]
 
-        # prepare the MRoPE for mllm if using multimodal
-        num_input_tokens = total_num_scheduled_tokens
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
         if self.is_multimodal_model:
@@ -962,27 +959,26 @@ def _process_reqs(
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
-            input_ids = self.input_ids[:num_input_tokens]
+            input_ids = self.input_ids[:total_num_scheduled_tokens]
             if mm_embeds:
                 inputs_embeds = self.model.get_input_embeddings(
                     input_ids, mm_embeds)
             else:
                 inputs_embeds = self.model.get_input_embeddings(input_ids)
             # TODO(woosuk): Avoid the copy. Optimize.
-            self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds)
+            self.inputs_embeds[:total_num_scheduled_tokens].copy_(
+                inputs_embeds)
             inputs_embeds = self.inputs_embeds[:num_input_tokens]
             input_ids = None
         else:
             # For text-only models, we use token ids as input.
             # While it is possible to use embeddings as input just like the
             # multimodal models, it is not desirable for performance since
-            # then the embedding layer is not included in the CUDA graph.
+            # then the embedding layer is not included in the ACL Graph.
             input_ids = self.input_ids[:num_input_tokens]
             inputs_embeds = None
         if self.uses_mrope:
             positions = self.mrope_positions[:, :num_input_tokens]
-        else:
-            positions = self.positions[:num_input_tokens]
 
         if (envs_ascend.VLLM_ENABLE_MC2
                 or self.torchair_graph_enabled) and not with_prefill:
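
For context, the sketch below illustrates the token-wise padding pattern the diff restores: the persistent input_ids / inputs_embeds buffers are written only for the tokens actually scheduled this step (total_num_scheduled_tokens), while the model consumes a slice padded up to the graph-capture length (num_input_tokens). This is a minimal, self-contained illustration under assumed names and sizes, not the vllm_ascend code itself; pad_to_graph_size, MAX_TOKENS, and GRAPH_SIZES are hypothetical stand-ins for however the padded length is actually chosen.

import torch

MAX_TOKENS = 4096                    # assumed capacity of the persistent buffers
GRAPH_SIZES = [8, 16, 32, 64, 128]   # assumed graph-capture sizes for illustration

input_ids = torch.zeros(MAX_TOKENS, dtype=torch.int64)   # persistent token buffer
inputs_embeds = torch.zeros(MAX_TOKENS, 1024)            # persistent embedding buffer


def pad_to_graph_size(n: int) -> int:
    # Round the real token count up to the next captured graph size;
    # fall back to the unpadded count (eager execution) if none is large enough.
    for size in GRAPH_SIZES:
        if n <= size:
            return size
    return n


def prepare_inputs(token_ids: torch.Tensor, embed_fn) -> torch.Tensor:
    total_num_scheduled_tokens = token_ids.numel()                    # tokens actually scheduled
    num_input_tokens = pad_to_graph_size(total_num_scheduled_tokens)  # padded length

    # Write only the scheduled tokens into the persistent buffers; the padded
    # tail is left untouched rather than being filled with this step's data.
    input_ids[:total_num_scheduled_tokens] = token_ids
    inputs_embeds[:total_num_scheduled_tokens] = embed_fn(
        input_ids[:total_num_scheduled_tokens])

    # The model always sees the padded slice so its shape matches a captured graph.
    return inputs_embeds[:num_input_tokens]


embed = torch.nn.Embedding(32000, 1024)
out = prepare_inputs(torch.randint(0, 32000, (13,)), embed)
assert out.shape == (16, 1024)   # 13 scheduled tokens padded up to the 16-token graph

The key point, mirrored in the diff, is that the copy into the embedding buffer is sliced by the real token count, while the slice handed to the model keeps the padded length expected by the captured graph.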
