 from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import get_model
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
-from vllm.multimodal.utils import group_mm_inputs_by_modality
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
 from vllm.v1.utils import bind_kv_cache
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
-from vllm.v1.worker.utils import (gather_mm_placeholders,
-                                  sanity_check_mm_encoder_outputs,
-                                  scatter_mm_placeholders)
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention import AttentionMaskBuilder
@@ -373,7 +367,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # Remove finished requests from the cached states.
         for req_id in scheduler_output.finished_req_ids:
             self.requests.pop(req_id, None)
-            self.encoder_cache.pop(req_id, None)
         # Remove the finished requests from the persistent batch.
         # NOTE(woosuk): There could be an edge case where finished_req_ids and
         # scheduled_req_ids overlap. This happens when a request is aborted and
@@ -386,14 +379,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
             if req_index is not None:
                 removed_req_indices.append(req_index)
 
-        # Free the cached encoder outputs.
-        for req_id, input_id in scheduler_output.free_encoder_input_ids:
-            encoder_outputs = self.encoder_cache.get(req_id)
-            if encoder_outputs is not None:
-                encoder_outputs.pop(input_id, None)
-                if not encoder_outputs:
-                    self.encoder_cache.pop(req_id, None)
-
         # Remove the unscheduled requests from the persistent batch.
         # NOTE(woosuk): The unscheduled requests are either preempted requests
         # or running requests that are not scheduled in this step. We remove
@@ -435,43 +420,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                 lora_request=new_req_data.lora_request,
             )
 
-            # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-            if self.uses_mrope:
-                image_grid_thw = []
-                video_grid_thw = []
-                second_per_grid_ts = []
-                audio_feature_lengths = []
-                use_audio_in_video = False
-                for mm_input in self.requests[req_id].mm_inputs:
-                    if mm_input.get("image_grid_thw") is not None:
-                        image_grid_thw.extend(
-                            mm_input["image_grid_thw"].tolist())
-                    if mm_input.get("video_grid_thw") is not None:
-                        video_grid_thw.extend(
-                            mm_input["video_grid_thw"].tolist())
-                    if mm_input.get("second_per_grid_ts") is not None:
-                        second_per_grid_ts.extend(
-                            mm_input["second_per_grid_ts"])
-                    if mm_input.get("audio_feature_lengths") is not None:
-                        audio_feature_lengths.extend(
-                            mm_input["audio_feature_lengths"])
-                    if mm_input.get("use_audio_in_video") is True:
-                        use_audio_in_video = True
-
-                hf_config = self.model_config.hf_config
-
-                self.requests[req_id].mrope_positions, \
-                    self.requests[req_id].mrope_position_delta = \
-                    MRotaryEmbedding.get_input_positions_tensor(
-                        self.requests[req_id].prompt_token_ids,
-                        hf_config=hf_config,
-                        image_grid_thw=image_grid_thw,
-                        video_grid_thw=video_grid_thw,
-                        second_per_grid_ts=second_per_grid_ts,
-                        audio_feature_lengths=audio_feature_lengths,
-                        use_audio_in_video=use_audio_in_video,
-                    )
-
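For context on the M-RoPE setup removed above: models like Qwen2-VL give every token a (temporal, height, width) position triple instead of a single index, and the returned mrope_position_delta lets decoding continue with ordinary increasing positions. The sketch below is a simplified toy illustration of that layout, not the MRotaryEmbedding implementation; the helper name and grid handling are assumptions.

```python
# Toy illustration (not the vLLM MRotaryEmbedding API): build (t, h, w)
# position triples for a prompt containing one image placeholder expanded
# into an h x w patch grid, then derive the delta used to keep decoding
# with plain 1-D positions.
import torch


def toy_mrope_positions(num_text_before: int, grid_h: int, grid_w: int,
                        num_text_after: int) -> tuple[torch.Tensor, int]:
    cols: list[list[int]] = []

    # Text tokens before the image: t == h == w, increasing by one.
    for i in range(num_text_before):
        cols.append([i, i, i])

    # Image tokens: the temporal index stays at the start offset, while the
    # height/width indices walk the patch grid row by row.
    t0 = num_text_before
    for r in range(grid_h):
        for c in range(grid_w):
            cols.append([t0, t0 + r, t0 + c])

    # Text after the image resumes from one past the largest index used.
    next_pos = max(max(col) for col in cols) + 1
    for i in range(num_text_after):
        cols.append([next_pos + i] * 3)

    positions = torch.tensor(cols, dtype=torch.long).T  # shape (3, seq_len)
    # Delta lets the decode phase keep using "max position so far + 1".
    delta = int(positions.max()) + 1 - positions.shape[1]
    return positions, delta


positions, delta = toy_mrope_positions(4, grid_h=2, grid_w=3, num_text_after=2)
print(positions.shape, delta)  # torch.Size([3, 12]) -3 (delta can be negative)
```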
             req_ids_to_add.append(req_id)
 
         # Update the states of the running/resumed requests.
@@ -596,166 +544,6 @@ def _make_attention_mask(self, seq_lens, query_lens, position,
         else:
             return None
 
-    def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
-        mrope_pos_ptr = 0
-        for index, req_id in enumerate(self.input_batch.req_ids):
-            req = self.requests[req_id]
-            assert req.mrope_positions is not None
-
-            num_computed_tokens = \
-                self.input_batch.num_computed_tokens_cpu[index]
-            num_scheduled_tokens = \
-                scheduler_output.num_scheduled_tokens[req_id]
-            num_prompt_tokens = len(req.prompt_token_ids)
-
-            if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
-                prompt_part_len = max(0,
-                                      num_prompt_tokens - num_computed_tokens)
-                completion_part_len = max(
-                    0, num_scheduled_tokens - prompt_part_len)
-            else:
-                prompt_part_len = num_scheduled_tokens
-                completion_part_len = 0
-
-            assert num_scheduled_tokens == prompt_part_len + completion_part_len
-
-            if prompt_part_len > 0:
-                # prompt's mrope_positions are pre-computed
-                dst_start = mrope_pos_ptr
-                dst_end = mrope_pos_ptr + prompt_part_len
-                src_start = num_computed_tokens
-                src_end = num_computed_tokens + prompt_part_len
-
-                self.mrope_positions_cpu[:, dst_start:dst_end] = \
-                    req.mrope_positions[:, src_start:src_end]
-
-                mrope_pos_ptr += prompt_part_len
-
-            if completion_part_len > 0:
-                # compute completion's mrope_positions on-the-fly
-                dst_start = mrope_pos_ptr
-                dst_end = mrope_pos_ptr + completion_part_len
-
-                self.mrope_positions_cpu[:, dst_start:dst_end] = \
-                    MRotaryEmbedding.get_next_input_positions_tensor(
-                        req.mrope_position_delta,
-                        context_len=num_computed_tokens + prompt_part_len,
-                        seq_len=num_computed_tokens + prompt_part_len +
-                        completion_part_len,
-                    )
-
-                mrope_pos_ptr += completion_part_len
-
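The helper removed above splits each request's scheduled tokens into a prompt part (whose M-RoPE positions were precomputed when the request was added) and a completion part (whose positions are derived on the fly from mrope_position_delta). A minimal standalone sketch of just that split, with a worked example, follows; the function name is hypothetical.

```python
def split_scheduled_tokens(num_computed: int, num_scheduled: int,
                           num_prompt: int) -> tuple[int, int]:
    """Return (prompt_part_len, completion_part_len) for one request."""
    if num_computed + num_scheduled > num_prompt:
        # This step crosses (or is already past) the prompt/completion boundary.
        prompt_part = max(0, num_prompt - num_computed)
        completion_part = max(0, num_scheduled - prompt_part)
    else:
        # Still inside the prompt (e.g. chunked prefill).
        prompt_part, completion_part = num_scheduled, 0
    assert prompt_part + completion_part == num_scheduled
    return prompt_part, completion_part


# Prompt of 10 tokens, 6 already computed, 8 scheduled this step: the last
# 4 prompt tokens reuse the precomputed positions, the remaining 4 are
# completion tokens whose positions come from the stored delta.
print(split_scheduled_tokens(6, 8, 10))  # (4, 4)
```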
-    def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
-        scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
-        if not scheduled_encoder_inputs:
-            return
-
-        # Batch the multi-modal inputs.
-        mm_inputs = list[MultiModalKwargs]()
-        req_ids_pos = list[tuple[str, int, PlaceholderRange]]()
-        for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
-            req_state = self.requests[req_id]
-
-            for mm_input_id in encoder_input_ids:
-                mm_inputs.append(req_state.mm_inputs[mm_input_id])
-                req_ids_pos.append(
-                    (req_id, mm_input_id, req_state.mm_positions[mm_input_id]))
-
-        # Batch mm inputs as much as we can: if a request in the batch has
-        # multiple modalities or a different modality than the previous one,
-        # we process it separately to preserve item order.
-        # FIXME(ywang96): This is a hacky way to deal with multiple modalities
-        # in the same batch while still being able to benefit from batching
-        # multimodal inputs. The proper solution should be reordering the
-        # encoder outputs.
-        grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs)
-
-        encoder_outputs = []
-        for grouped_mm_inputs in grouped_mm_inputs_list:
-            batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
-            batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs,
-                                                           device=self.device)
-
-            # Run the encoder.
-            # `curr_group_outputs` is either of the following:
-            # 1. A tensor of shape (num_items, feature_size, hidden_size)
-            #    in case feature_size is fixed across all multimodal items.
-            # 2. A list or tuple (length: num_items) of tensors, each of shape
-            #    (feature_size, hidden_size) in case the feature size is dynamic
-            #    depending on the input multimodal items.
-            curr_group_outputs = self.model.get_multimodal_embeddings(
-                **batched_mm_inputs)
-
-            sanity_check_mm_encoder_outputs(
-                curr_group_outputs,
-                expected_num_items=len(grouped_mm_inputs),
-            )
-
-            for output in curr_group_outputs:
-                encoder_outputs.append(output)
-
-        # Cache the encoder outputs.
-        for (req_id, input_id, pos_info), output in zip(
-                req_ids_pos,
-                encoder_outputs,
-        ):
-            if req_id not in self.encoder_cache:
-                self.encoder_cache[req_id] = {}
-
-            self.encoder_cache[req_id][input_id] = scatter_mm_placeholders(
-                output,
-                is_embed=pos_info.is_embed,
-            )
-
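As the FIXME above notes, the removed encoder path only batches consecutive inputs that share a single modality so that item order is preserved. The snippet below is a hedged sketch of that grouping idea over assumed dict-shaped inputs; it is not the actual group_mm_inputs_by_modality implementation.

```python
from itertools import groupby


def group_consecutive_by_modality(items: list[dict]) -> list[list[dict]]:
    """Group consecutive inputs that share exactly one, identical modality.

    Items with several modalities, or a modality different from their
    neighbour's, end up in their own group so the original order survives.
    """
    def key(item: dict) -> tuple[str, ...]:
        return tuple(sorted(item.keys()))

    groups: list[list[dict]] = []
    for mods, run in groupby(items, key=key):
        run = list(run)
        if len(mods) == 1:
            groups.append(run)                 # safe to batch the whole run
        else:
            groups.extend([it] for it in run)  # mixed modalities: one per group
    return groups


batch = [{"image": 1}, {"image": 2}, {"video": 3}, {"image": 4}]
print([len(g) for g in group_consecutive_by_modality(batch)])  # [2, 1, 1]
```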
-    def _gather_mm_embeddings(
-        self,
-        scheduler_output: "SchedulerOutput",
-    ) -> list[torch.Tensor]:
-        mm_embeds: list[torch.Tensor] = []
-        for req_id in self.input_batch.req_ids:
-            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
-                req_id]
-            req_state = self.requests[req_id]
-            num_computed_tokens = req_state.num_computed_tokens
-            mm_positions = req_state.mm_positions
-            for i, pos_info in enumerate(mm_positions):
-                start_pos = pos_info.offset
-                num_encoder_tokens = pos_info.length
-
-                # The encoder output is needed if the two ranges overlap:
-                # [num_computed_tokens,
-                #  num_computed_tokens + num_scheduled_tokens) and
-                # [start_pos, start_pos + num_encoder_tokens)
-                if start_pos >= num_computed_tokens + num_scheduled_tokens:
-                    # The encoder output is not needed in this step.
-                    break
-                if start_pos + num_encoder_tokens <= num_computed_tokens:
-                    # The encoder output is already processed and stored
-                    # in the decoder's KV cache.
-                    continue
-
-                start_idx = max(num_computed_tokens - start_pos, 0)
-                end_idx = min(
-                    num_computed_tokens - start_pos + num_scheduled_tokens,
-                    num_encoder_tokens)
-                assert start_idx < end_idx
-                assert req_id in self.encoder_cache
-                assert i in self.encoder_cache[req_id]
-                encoder_output = self.encoder_cache[req_id][i]
-
-                if (is_embed := pos_info.is_embed) is not None:
-                    is_embed = is_embed[start_idx:end_idx]
-
-                mm_embeds_item = gather_mm_placeholders(
-                    encoder_output[start_idx:end_idx],
-                    is_embed=is_embed,
-                )
-                mm_embeds.append(mm_embeds_item)
-        return mm_embeds
-
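The gather loop removed above slices each cached encoder output to the part that overlaps the window of tokens computed in this step. The standalone function below reproduces only that interval arithmetic (using the same variable names as the original) with a worked example.

```python
from typing import Optional


def encoder_slice(num_computed_tokens: int, num_scheduled_tokens: int,
                  start_pos: int,
                  num_encoder_tokens: int) -> Optional[tuple[int, int]]:
    """Return the [start_idx, end_idx) slice of one encoder output needed
    this step, or None if the placeholder range does not overlap the window."""
    window_end = num_computed_tokens + num_scheduled_tokens
    if start_pos >= window_end:
        return None  # placeholder starts after this step's window
    if start_pos + num_encoder_tokens <= num_computed_tokens:
        return None  # placeholder already fully in the decoder's KV cache
    start_idx = max(num_computed_tokens - start_pos, 0)
    end_idx = min(num_computed_tokens - start_pos + num_scheduled_tokens,
                  num_encoder_tokens)
    return start_idx, end_idx


# Placeholder occupies prompt positions [8, 8 + 16); this step computes
# tokens [12, 12 + 10), so encoder rows [4, 14) are needed.
print(encoder_slice(12, 10, 8, 16))  # (4, 14)
```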
     def _process_reqs(
         self,
         scheduler_output: "SchedulerOutput",
@@ -818,17 +606,6 @@ def _process_reqs(
                arange,
                out=positions_np)
 
-        # Calculate M-RoPE positions.
-        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-        if self.uses_mrope:
-            self._calc_mrope_positions(scheduler_output)
-
-        if self.uses_mrope:
-            # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-            self.mrope_positions[:, :total_num_scheduled_tokens].copy_(
-                self.mrope_positions_cpu[:, :total_num_scheduled_tokens],
-                non_blocking=True)
-
         self.positions[:total_num_scheduled_tokens].copy_(
             self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True)
         positions = self.positions[:num_input_tokens]
@@ -948,43 +725,6 @@ def _process_reqs(
             input_ids = self.input_ids[:padded_batch_size]
             positions = self.positions[:padded_batch_size]
 
-        # prepare the MRoPE for mllm if using multimodal
-        num_input_tokens = total_num_scheduled_tokens
-        # _prepare_inputs may reorder the batch, so we must gather multi
-        # modal outputs after that to ensure the correct order
-        if self.is_multimodal_model:
-            # Run the multimodal encoder if any.
-            self._execute_mm_encoder(scheduler_output)
-            mm_embeds = self._gather_mm_embeddings(scheduler_output)
-        else:
-            mm_embeds = []
-
-        if self.is_multimodal_model:
-            # NOTE(woosuk): To unify token ids and soft tokens (vision
-            # embeddings), we always use embeddings (rather than token ids)
-            # as input to the multimodal model, even when the input is text.
-            input_ids = self.input_ids[:num_input_tokens]
-            if mm_embeds:
-                inputs_embeds = self.model.get_input_embeddings(
-                    input_ids, mm_embeds)
-            else:
-                inputs_embeds = self.model.get_input_embeddings(input_ids)
-            # TODO(woosuk): Avoid the copy. Optimize.
-            self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds)
-            inputs_embeds = self.inputs_embeds[:num_input_tokens]
-            input_ids = None
-        else:
-            # For text-only models, we use token ids as input.
-            # While it is possible to use embeddings as input just like the
-            # multimodal models, it is not desirable for performance since
-            # then the embedding layer is not included in the CUDA graph.
-            input_ids = self.input_ids[:num_input_tokens]
-            inputs_embeds = None
-        if self.uses_mrope:
-            positions = self.mrope_positions[:, :num_input_tokens]
-        else:
-            positions = self.positions[:num_input_tokens]
-
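The block removed above always feeds multimodal models embeddings rather than token ids, merging the text-token embeddings with the gathered encoder outputs at the placeholder positions. Below is a rough, framework-agnostic sketch of such a merge; the sentinel id and helper name are assumptions for illustration, not the vLLM get_input_embeddings API.

```python
import torch

PLACEHOLDER_TOKEN_ID = -1  # assumed sentinel marking multimodal positions


def merge_embeddings(token_embeds: torch.Tensor, input_ids: torch.Tensor,
                     mm_embeds: torch.Tensor) -> torch.Tensor:
    """Scatter multimodal embeddings into the placeholder slots.

    token_embeds: (seq_len, hidden) text-token embeddings
    input_ids:    (seq_len,) with PLACEHOLDER_TOKEN_ID at image positions
    mm_embeds:    (num_placeholders, hidden) encoder outputs, in order
    """
    merged = token_embeds.clone()
    mask = input_ids == PLACEHOLDER_TOKEN_ID
    assert int(mask.sum()) == mm_embeds.shape[0]
    merged[mask] = mm_embeds
    return merged


seq_len, hidden = 6, 4
input_ids = torch.tensor([5, -1, -1, 7, 9, 11])
token_embeds = torch.zeros(seq_len, hidden)
mm_embeds = torch.ones(2, hidden)
out = merge_embeddings(token_embeds, input_ids, mm_embeds)
print(out[:, 0])  # tensor([0., 1., 1., 0., 0., 0.])
```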
         # Run forward pass
         with set_forward_context(attn_metadata,
                                  self.vllm_config,
@@ -1001,7 +741,7 @@ def _process_reqs(
                         input_ids=input_ids,
                         positions=positions,
                         intermediate_tensors=intermediate_tensors,
-                        inputs_embeds=inputs_embeds,
+                        inputs_embeds=None,
                         **model_kwargs,
                     )
                 else:
@@ -1010,7 +750,7 @@ def _process_reqs(
                         input_ids=input_ids,
                         positions=positions,
                         intermediate_tensors=intermediate_tensors,
-                        inputs_embeds=inputs_embeds,
+                        inputs_embeds=None,
                         **model_kwargs,
                     )
 
@@ -1493,11 +1233,8 @@ def _dummy_run(
         return hidden_states
 
     def profile_run(self) -> None:
-        # FIXME Profile with multimodal encoder & encoder cache.
-        # current _profile_multimodal() using PyTorch SDPA backend method not
-        # support for window/full attn to reduce Memcpy operations, so will cause
-        # Out Of Memory problem, so we currently don't use self._profile_multimodal()
-        # self._profile_multimodal()
+        # Profile with multimodal encoder & encoder cache.
+        self._profile_multimodal()
 
         # For profile, have maximum num_reqs and that collectively have
         # maximum num_tokens.