@@ -945,10 +945,7 @@ def _process_reqs(
945945 # Copy the tensors to the NPU.
946946 self .input_ids [:total_num_scheduled_tokens ].copy_ (
947947 self .input_ids_cpu [:total_num_scheduled_tokens ], non_blocking = True )
948- input_ids = self .input_ids [:num_input_tokens ]
949948
950- # prepare the MRoPE for mllm if using multimodal
951- num_input_tokens = total_num_scheduled_tokens
952949 # _prepare_inputs may reorder the batch, so we must gather multi
953950 # modal outputs after that to ensure the correct order
954951 if self .is_multimodal_model :
@@ -962,27 +959,26 @@ def _process_reqs(
962959 # NOTE(woosuk): To unify token ids and soft tokens (vision
963960 # embeddings), we always use embeddings (rather than token ids)
964961 # as input to the multimodal model, even when the input is text.
965- input_ids = self .input_ids [:num_input_tokens ]
962+ input_ids = self .input_ids [:total_num_scheduled_tokens ]
966963 if mm_embeds :
967964 inputs_embeds = self .model .get_input_embeddings (
968965 input_ids , mm_embeds )
969966 else :
970967 inputs_embeds = self .model .get_input_embeddings (input_ids )
971968 # TODO(woosuk): Avoid the copy. Optimize.
972- self .inputs_embeds [:num_input_tokens ].copy_ (inputs_embeds )
969+ self .inputs_embeds [:total_num_scheduled_tokens ].copy_ (
970+ inputs_embeds )
973971 inputs_embeds = self .inputs_embeds [:num_input_tokens ]
974972 input_ids = None
975973 else :
976974 # For text-only models, we use token ids as input.
977975 # While it is possible to use embeddings as input just like the
978976 # multimodal models, it is not desirable for performance since
979- # then the embedding layer is not included in the CUDA graph .
977+ # then the embedding layer is not included in the ACL Graph .
980978 input_ids = self .input_ids [:num_input_tokens ]
981979 inputs_embeds = None
982980 if self .uses_mrope :
983981 positions = self .mrope_positions [:, :num_input_tokens ]
984- else :
985- positions = self .positions [:num_input_tokens ]
986982
987983 if (envs_ascend .VLLM_ENABLE_MC2
988984 or self .torchair_graph_enabled ) and not with_prefill :
0 commit comments