huggingface · ylacombe · Jan 19, 2024 · Jan 18, 2024 · Jan 18, 2024 · Jan 19, 2024
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -582,8 +582,11 @@ def _forward(self, model_inputs, return_timestamps=False, generate_kwargs=None):
                     out["stride"] = stride
 
         else:
-            input_values = model_inputs.pop("input_values")
-            outputs = self.model(input_values=input_values, attention_mask=attention_mask)
+            inputs = {
+                self.model.main_input_name: model_inputs.pop(self.model.main_input_name),
+                "attention_mask": attention_mask,
+            }
+            outputs = self.model(**inputs)
             logits = outputs.logits
 
             if self.type == "ctc_with_lm":

diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -298,6 +298,23 @@ def test_torch_large(self):
         output = speech_recognizer(filename)
         self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
 
+    @require_torch
+    @slow
+    def test_torch_large_with_input_features(self):
+        speech_recognizer = pipeline(
+            task="automatic-speech-recognition",
+            model="hf-audio/wav2vec2-bert-CV16-en",
+            framework="pt",
+        )
+        waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
     Args: 
         input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): 
             Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file 
             into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install 
             soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and 
             conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. 
     Args: 
         input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): 
             Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file 
             into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install 
             soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and 
             conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. 
+        output = speech_recognizer(waveform)
+        self.assertEqual(output, {"text": ""})
+
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
+        filename = ds[40]["file"]
+        output = speech_recognizer(filename)
+        self.assertEqual(output, {"text": "a man said to the universe sir i exist"})
+
     @slow
     @require_torch
     @slow