diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b31571a7c000..facc79bb5ba0 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3401,26 +3401,28 @@ def _dummy_run(
         with self.maybe_dummy_run_with_lora(
             self.lora_config, num_scheduled_tokens, remove_lora
         ):
-            model_kwargs = self._init_model_kwargs(num_tokens)
+            # Make sure padding doesn't exceed max_num_tokens
+            assert num_tokens_after_padding <= self.max_num_tokens
+            model_kwargs = self._init_model_kwargs(num_tokens_after_padding)
             if self.supports_mm_inputs and not self.model_config.is_encoder_decoder:
                 input_ids = None
-                inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
+                inputs_embeds = self.inputs_embeds.gpu[:num_tokens_after_padding]
                 model_kwargs = {
                     **model_kwargs,
                     **self._dummy_mm_kwargs(num_reqs),
                 }
             elif self.enable_prompt_embeds:
                 input_ids = None
-                inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
-                model_kwargs = self._init_model_kwargs(num_tokens)
+                inputs_embeds = self.inputs_embeds.gpu[:num_tokens_after_padding]
+                model_kwargs = self._init_model_kwargs(num_tokens_after_padding)
             else:
-                input_ids = self.input_ids.gpu[:num_tokens]
+                input_ids = self.input_ids.gpu[:num_tokens_after_padding]
                 inputs_embeds = None
 
             if self.uses_mrope:
-                positions = self.mrope_positions.gpu[:, :num_tokens]
+                positions = self.mrope_positions.gpu[:, :num_tokens_after_padding]
             else:
-                positions = self.positions.gpu[:num_tokens]
+                positions = self.positions.gpu[:num_tokens_after_padding]
 
             if get_pp_group().is_first_rank:
                 intermediate_tensors = None
@@ -3435,7 +3437,7 @@ def _dummy_run(
             )
 
             intermediate_tensors = self.sync_and_slice_intermediate_tensors(
-                num_tokens, None, False
+                num_tokens_after_padding, None, False
             )
 
             # filter out the valid batch descriptor
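
Note: the diff switches every per-token buffer slice in `_dummy_run` from the raw `num_tokens` to `num_tokens_after_padding`, and asserts that the padded count stays within the preallocated `max_num_tokens`. The sketch below illustrates that invariant outside of vLLM; `pad_for_cudagraph`, `capture_sizes`, and the buffer names are hypothetical stand-ins, not the runner's actual API.

```python
# Minimal sketch: why dummy-run buffers are sliced with the *padded* token
# count rather than the raw one. All names here are illustrative.
import bisect

import torch


def pad_for_cudagraph(num_tokens: int, capture_sizes: list[int]) -> int:
    """Round num_tokens up to the nearest captured CUDA graph batch size."""
    idx = bisect.bisect_left(capture_sizes, num_tokens)
    return capture_sizes[idx] if idx < len(capture_sizes) else num_tokens


max_num_tokens = 8192
capture_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]

# Persistent buffers are preallocated to max_num_tokens, so slicing up to the
# padded size is safe only if the assert below holds.
input_ids = torch.zeros(max_num_tokens, dtype=torch.int64)
positions = torch.zeros(max_num_tokens, dtype=torch.int64)

num_tokens = 100
num_tokens_after_padding = pad_for_cudagraph(num_tokens, capture_sizes)

# Mirrors the assert added in the diff: padding must not exceed the
# preallocated buffer capacity.
assert num_tokens_after_padding <= max_num_tokens

# Slicing with the raw num_tokens (100) would produce shapes that do not
# match the captured graph; the padded count (128) does.
dummy_input_ids = input_ids[:num_tokens_after_padding]
dummy_positions = positions[:num_tokens_after_padding]
assert dummy_input_ids.shape[0] == 128
```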