File tree Expand file tree Collapse file tree 2 files changed +13
-3
lines changed 
src/transformers/models/granite_speech Expand file tree Collapse file tree 2 files changed +13
-3
lines changed Original file line number Diff line number Diff line change @@ -74,9 +74,15 @@ def __call__(
7474        )
7575        audio_embed_sizes  =  self ._get_num_audio_features (audio_lengths )
7676        speech_inputs ["audio_embed_sizes" ] =  audio_embed_sizes 
77-         # TODO: input_features_mask is not a great name, because 
78-         # input_features and input_features_mask have different shapes 
79-         # (before/after the projector) 
77+         # TODO (@alex-jw-brooks): Currently input_features_mask is not 
78+         # a great name, because input_features and input_features_mask 
79+         # have different shapes (before/after the projector). 
80+         # 
81+         # We should align this with other multimodal models, e.g,. llava 
82+         # and qwen2audio and refactor this to ensure input_feature_mask 
83+         # has the same dimensionality as input_features, or compute it in 
84+         # the model based on the audio embedding sizes (since we do not 
85+         # have an attention mask for the audio features to infer padding from). 
8086        speech_inputs ["input_features_mask" ] =  torch .arange (max (audio_embed_sizes )).view (1 , - 1 ) <  torch .tensor (
8187            audio_embed_sizes 
8288        ).view (- 1 , 1 )
Original file line number Diff line number Diff line change @@ -62,6 +62,10 @@ def __call__(
6262            # trigger the conditions due to the way they call multimodal 
6363            # processors, e.g., vLLM. 
6464            audio_inputs  =  self .audio_processor (audio , device = device )
65+ 
66+             # TODO (@alex-jw-brooks); we should add a util to get_num_audio_tokens 
67+             # from feature lengths and call it here, rather than returning it 
68+             # from the feature extractor. 
6569            audio_embed_sizes  =  audio_inputs .pop ("audio_embed_sizes" )
6670
6771            # Expand the audio placeholders to match the feature dims; this 
 
 
   
 
     
   
   
          
    
    
     
    
      
     
     
    You can’t perform that action at this time.
  
 
    
  
    
      
        
     
       
      
     
   
 
    
    
  
 
  
 
     
    
0 commit comments