@@ -92,6 +92,7 @@ def __init__(
9292 # Multi-modal data support
9393 self .input_registry = INPUT_REGISTRY
9494 self .mm_registry = MULTIMODAL_REGISTRY
95+ self .uses_mrope = model_config .uses_mrope
9596
9697 # NOTE: Initialized input mapper is only used for processing dummy
9798 # multimodal data into multimodal kwargs for GPU memory profiling.
@@ -147,7 +148,7 @@ def __init__(
147148 device = self .device )
148149
149150 # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
150- if self .model_config . uses_mrope :
151+ if self .uses_mrope :
151152 # NOTE: `mrope_positions` is implemented with one additional dummy
152153 # position on purpose to make it non-contiguous so that it can work
153154 # with torch compile.
@@ -284,7 +285,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool:
284285 )
285286
286287 # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
287- if self .model_config . uses_mrope :
288+ if self .uses_mrope :
288289 image_grid_thw = []
289290 video_grid_thw = []
290291 second_per_grid_ts = []
@@ -411,7 +412,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
411412
412413 # Calculate M-RoPE positions.
413414 # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
414- if self .model_config . uses_mrope :
415+ if self .uses_mrope :
415416 self ._calc_mrope_positions (scheduler_output )
416417
417418 # Get token indices.
@@ -458,7 +459,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
458459 # Copy the tensors to the GPU.
459460 self .input_ids [:total_num_scheduled_tokens ].copy_ (
460461 self .input_ids_cpu [:total_num_scheduled_tokens ], non_blocking = True )
461- if self .model_config . uses_mrope :
462+ if self .uses_mrope :
462463 # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
463464 self .mrope_positions [:, :total_num_scheduled_tokens ].copy_ (
464465 self .mrope_positions_cpu [:, :total_num_scheduled_tokens ],
@@ -817,13 +818,14 @@ def execute_model(
817818 # then the embedding layer is not included in the CUDA graph.
818819 input_ids = self .input_ids [:num_input_tokens ]
819820 inputs_embeds = None
821+ if self .uses_mrope :
822+ positions = self .mrope_positions [:, :num_input_tokens ]
823+ else :
824+ positions = self .positions [:num_input_tokens ]
820825
821826 # Run the decoder.
822827 # Use persistent buffers for CUDA graphs.
823828 with set_forward_context (attn_metadata , self .vllm_config ):
824- positions = self .mrope_positions [:, :num_input_tokens ] \
825- if self .model_config .uses_mrope \
826- else self .positions [:num_input_tokens ]
827829 hidden_states = self .model (
828830 input_ids = input_ids ,
829831 positions = positions ,
@@ -1001,10 +1003,11 @@ def _dummy_run(
10011003 else :
10021004 input_ids = self .input_ids [:num_tokens ]
10031005 inputs_embeds = None
1006+ if self .uses_mrope :
1007+ positions = self .mrope_positions [:, :num_tokens ]
1008+ else :
1009+ positions = self .positions [:num_tokens ]
10041010 with set_forward_context (None , self .vllm_config ):
1005- positions = self .mrope_positions [:, :num_tokens ] \
1006- if self .model_config .uses_mrope \
1007- else self .positions [:num_tokens ]
10081011 hidden_states = model (
10091012 input_ids = input_ids ,
10101013 positions = positions ,
0 commit comments