@@ -3434,26 +3434,28 @@ def _dummy_run(
34343434 with self .maybe_dummy_run_with_lora (
34353435 self .lora_config , num_scheduled_tokens , remove_lora
34363436 ):
3437- model_kwargs = self ._init_model_kwargs (num_tokens )
3437+ # Make sure padding doesn't exceed max_num_tokens
3438+ assert num_tokens_after_padding <= self .max_num_tokens
3439+ model_kwargs = self ._init_model_kwargs (num_tokens_after_padding )
34383440 if self .supports_mm_inputs and not self .model_config .is_encoder_decoder :
34393441 input_ids = None
3440- inputs_embeds = self .inputs_embeds .gpu [:num_tokens ]
3442+ inputs_embeds = self .inputs_embeds .gpu [:num_tokens_after_padding ]
34413443 model_kwargs = {
34423444 ** model_kwargs ,
34433445 ** self ._dummy_mm_kwargs (num_reqs ),
34443446 }
34453447 elif self .enable_prompt_embeds :
34463448 input_ids = None
3447- inputs_embeds = self .inputs_embeds .gpu [:num_tokens ]
3448- model_kwargs = self ._init_model_kwargs (num_tokens )
3449+ inputs_embeds = self .inputs_embeds .gpu [:num_tokens_after_padding ]
3450+ model_kwargs = self ._init_model_kwargs (num_tokens_after_padding )
34493451 else :
3450- input_ids = self .input_ids .gpu [:num_tokens ]
3452+ input_ids = self .input_ids .gpu [:num_tokens_after_padding ]
34513453 inputs_embeds = None
34523454
34533455 if self .uses_mrope :
3454- positions = self .mrope_positions .gpu [:, :num_tokens ]
3456+ positions = self .mrope_positions .gpu [:, :num_tokens_after_padding ]
34553457 else :
3456- positions = self .positions .gpu [:num_tokens ]
3458+ positions = self .positions .gpu [:num_tokens_after_padding ]
34573459
34583460 if get_pp_group ().is_first_rank :
34593461 intermediate_tensors = None
@@ -3468,7 +3470,7 @@ def _dummy_run(
34683470 )
34693471
34703472 intermediate_tensors = self .sync_and_slice_intermediate_tensors (
3471- num_tokens , None , False
3473+ num_tokens_after_padding , None , False
34723474 )
34733475
34743476 # filter out the valid batch descriptor
0 commit comments