
Commit 24700c3

[V1] Cache uses_mrope in GPUModelRunner (#12969)
1 parent d366ccc commit 24700c3

1 file changed: +13 -10 lines changed


vllm/v1/worker/gpu_model_runner.py

Lines changed: 13 additions & 10 deletions
@@ -92,6 +92,7 @@ def __init__(
         # Multi-modal data support
         self.input_registry = INPUT_REGISTRY
         self.mm_registry = MULTIMODAL_REGISTRY
+        self.uses_mrope = model_config.uses_mrope

         # NOTE: Initialized input mapper is only used for processing dummy
         # multimodal data into multimodal kwargs for GPU memory profiling.
@@ -147,7 +148,7 @@ def __init__(
             device=self.device)

         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-        if self.model_config.uses_mrope:
+        if self.uses_mrope:
             # NOTE: `mrope_positions` is implemented with one additional dummy
             # position on purpose to make it non-contiguous so that it can work
             # with torch compile.
@@ -284,7 +285,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool:
             )

             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-            if self.model_config.uses_mrope:
+            if self.uses_mrope:
                 image_grid_thw = []
                 video_grid_thw = []
                 second_per_grid_ts = []
@@ -411,7 +412,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):

         # Calculate M-RoPE positions.
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-        if self.model_config.uses_mrope:
+        if self.uses_mrope:
             self._calc_mrope_positions(scheduler_output)

         # Get token indices.
@@ -458,7 +459,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         # Copy the tensors to the GPU.
         self.input_ids[:total_num_scheduled_tokens].copy_(
             self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
-        if self.model_config.uses_mrope:
+        if self.uses_mrope:
             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
             self.mrope_positions[:, :total_num_scheduled_tokens].copy_(
                 self.mrope_positions_cpu[:, :total_num_scheduled_tokens],
@@ -817,13 +818,14 @@ def execute_model(
             # then the embedding layer is not included in the CUDA graph.
             input_ids = self.input_ids[:num_input_tokens]
             inputs_embeds = None
+        if self.uses_mrope:
+            positions = self.mrope_positions[:, :num_input_tokens]
+        else:
+            positions = self.positions[:num_input_tokens]

         # Run the decoder.
         # Use persistent buffers for CUDA graphs.
         with set_forward_context(attn_metadata, self.vllm_config):
-            positions = self.mrope_positions[:, :num_input_tokens] \
-                if self.model_config.uses_mrope \
-                else self.positions[:num_input_tokens]
             hidden_states = self.model(
                 input_ids=input_ids,
                 positions=positions,
@@ -1001,10 +1003,11 @@ def _dummy_run(
         else:
             input_ids = self.input_ids[:num_tokens]
             inputs_embeds = None
+        if self.uses_mrope:
+            positions = self.mrope_positions[:, :num_tokens]
+        else:
+            positions = self.positions[:num_tokens]
         with set_forward_context(None, self.vllm_config):
-            positions = self.mrope_positions[:, :num_tokens] \
-                if self.model_config.uses_mrope \
-                else self.positions[:num_tokens]
             hidden_states = model(
                 input_ids=input_ids,
                 positions=positions,
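
The change visible in the diff is twofold: the uses_mrope flag is read from model_config once in __init__ and stored on the runner, and the positions selection is hoisted out of the with set_forward_context(...) block into a plain if/else. Below is a minimal, self-contained sketch of that pattern; the RunnerSketch class, the tensor sizes, and the select_positions helper are illustrative assumptions, not the actual GPUModelRunner API.

import torch


class ModelConfig:
    # Hypothetical stand-in for the model config; only the flag we need here.
    def __init__(self, uses_mrope: bool):
        self.uses_mrope = uses_mrope


class RunnerSketch:
    def __init__(self, model_config: ModelConfig, max_tokens: int = 16):
        self.model_config = model_config
        # Cache the flag once so hot-path code reads a plain attribute
        # instead of dereferencing model_config on every call.
        self.uses_mrope = model_config.uses_mrope

        # 1-D positions for ordinary RoPE; 3 x N (t/h/w) positions for M-RoPE.
        self.positions = torch.arange(max_tokens)
        self.mrope_positions = torch.zeros(3, max_tokens, dtype=torch.int64)

    def select_positions(self, num_input_tokens: int) -> torch.Tensor:
        # Mirrors the commit: an explicit if/else evaluated before entering
        # the forward context, rather than a conditional expression inside it.
        if self.uses_mrope:
            return self.mrope_positions[:, :num_input_tokens]
        return self.positions[:num_input_tokens]


if __name__ == "__main__":
    print(RunnerSketch(ModelConfig(uses_mrope=True)).select_positions(4).shape)   # torch.Size([3, 4])
    print(RunnerSketch(ModelConfig(uses_mrope=False)).select_positions(4).shape)  # torch.Size([4])

Behavior is unchanged: the same branch now runs against a cached attribute, and moving it above the forward-context block keeps that block limited to the model call itself.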
