From b505ab7dbeab04b87f85b46c08cd180631fb0a04 Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Tue, 2 Aug 2022 15:04:45 +0200
Subject: [PATCH] Change audio kwarg to images in TROCR processor (#18421)

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
---
 .../models/trocr/processing_trocr.py          | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/trocr/processing_trocr.py b/src/transformers/models/trocr/processing_trocr.py
index 44a276fd63ae6c..752986243f8241 100644
--- a/src/transformers/models/trocr/processing_trocr.py
+++ b/src/transformers/models/trocr/processing_trocr.py
@@ -54,27 +54,23 @@ def __call__(self, *args, **kwargs):
         if self._in_target_context_manager:
             return self.current_processor(*args, **kwargs)
 
-        if "raw_speech" in kwargs:
-            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
-            audio = kwargs.pop("raw_speech")
-        else:
-            audio = kwargs.pop("audio", None)
+        images = kwargs.pop("images", None)
         text = kwargs.pop("text", None)
         if len(args) > 0:
-            audio = args[0]
+            images = args[0]
             args = args[1:]
 
-        if audio is None and text is None:
-            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+        if images is None and text is None:
+            raise ValueError("You need to specify either an `images` or `text` input to process.")
 
-        if audio is not None:
-            inputs = self.feature_extractor(audio, *args, **kwargs)
+        if images is not None:
+            inputs = self.feature_extractor(images, *args, **kwargs)
         if text is not None:
             encodings = self.tokenizer(text, **kwargs)
 
         if text is None:
             return inputs
-        elif audio is None:
+        elif images is None:
             return encodings
         else:
             inputs["labels"] = encodings["input_ids"]
@@ -102,7 +98,7 @@ def as_target_processor(self):
         warnings.warn(
             "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
             "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
-            "your audio inputs, or in a separate call."
+            "your images inputs, or in a separate call."
         )
         self._in_target_context_manager = True
         self.current_processor = self.tokenizer