From 1a78ce82f4e47a38b9bdee0b477c13d8febe937b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 24 Jan 2024 21:27:24 +0000 Subject: [PATCH 01/25] up --- .../models/whisper/generation_whisper.py | 265 +++++++++--------- 1 file changed, 140 insertions(+), 125 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index c45fffb984b113..4acdf3328137aa 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -41,6 +41,11 @@ logger = logging.get_logger(__name__) +from transformers import AutoTokenizer + +tok = AutoTokenizer.from_pretrained("openai/whisper-medium.en") + + def _median_filter(inputs: torch.Tensor, filter_width: int) -> torch.Tensor: """ Applies a median filter of width `filter_width` along the last dimension of the input. @@ -260,6 +265,7 @@ def generate( language: Optional[str] = None, is_multilingual: Optional[bool] = None, prompt_ids: Optional[torch.Tensor] = None, + prompt_condition_type: str = "first-segment", # first-segment, all-segments condition_on_prev_tokens: Optional[bool] = None, temperature: Optional[Union[float, Tuple[float, ...]]] = None, compression_ratio_threshold: Optional[float] = None, @@ -505,15 +511,6 @@ def generate( self._set_language_and_task( language=language, task=task, is_multilingual=is_multilingual, generation_config=generation_config ) - # pass self.config for backward compatibility - self._set_forced_decoder_ids( - task=task, - language=language, - prompt_ids=prompt_ids, - generation_config=generation_config, - config=self.config, - kwargs=kwargs, - ) self._set_token_ids(generation_config=generation_config, config=self.config, kwargs=kwargs) self._set_num_frames( return_token_timestamps=return_token_timestamps, generation_config=generation_config, kwargs=kwargs @@ -525,12 +522,19 @@ def generate( no_speech_threshold=no_speech_threshold, condition_on_prev_tokens=condition_on_prev_tokens, ) + # pass self.config for backward compatibility + init_tokens = self.retrieve_init_tokens( + generation_config=generation_config, + config=self.config, + kwargs=kwargs, + ) # 4. Retrieve logits processors + begin_index = len(init_tokens) logits_processor = self._retrieve_logit_processors( generation_config=generation_config, logits_processor=logits_processor, - no_speech_threshold=no_speech_threshold, + begin_index=begin_index, # begin index is index of first generated decoder token is_shortform=is_shortform, num_beams=kwargs.get("num_beams", 1), ) @@ -540,6 +544,14 @@ def generate( if temperature is not None: kwargs["temperature"] = temperature + decoder_input_ids = kwargs.pop("decoder_input_ids", None) + + if decoder_input_ids is not None and len(init_tokens) > 0: + logger.warn(f"You have provided `decoder_input_ids` which will overwrite the `init_tokens` {init_tokens}. 
This might lead to unexpected behavior.") + elif len(init_tokens) > 0: + one_tensor = torch.ones((input_features.shape[0], 1), device=input_features.device, dtype=torch.long) + decoder_input_ids = torch.cat([t * one_tensor for t in init_tokens], dim=-1) + outputs = super().generate( input_features, generation_config=generation_config, @@ -547,6 +559,7 @@ def generate( stopping_criteria=stopping_criteria, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, synced_gpus=synced_gpus, + decoder_input_ids=decoder_input_ids, **kwargs, ) @@ -573,11 +586,11 @@ def generate( max_frames, seek = self._retrieve_max_frames_and_seek( batch_size=batch_size, attention_mask=attention_mask, total_input_frames=total_input_frames ) - init_tokens = self._retrieve_init_tokens_from_forced_decoder_ids(generation_config=generation_config) # 6.2 Preppare running variables, list for generation cur_bsz = batch_size - current_segments = [[] for _ in range(batch_size)] + current_segments = self._prepare_segments(prompt_ids=prompt_ids, batch_size=batch_size, prompt_condition_type=prompt_condition_type, generation_config=generation_config) + batch_idx_map = list(range(batch_size)) do_condition_on_prev_tokens = [condition_on_prev_tokens for _ in range(batch_size)] @@ -617,6 +630,7 @@ def generate( current_segments=current_segments, batch_idx_map=batch_idx_map, do_condition_on_prev_tokens=do_condition_on_prev_tokens, + prompt_ids=prompt_ids, generation_config=generation_config, config=self.config, device=segment_input.device, @@ -682,11 +696,12 @@ def generate( # 7. Once all segments are added to the list of all segments, called `current_segments`, we extract the predicted # output tokens from the list of dicts. If we use batch size > 1, we make sure to pad the output - sequences = _pad_to_max_length(current_segments, generation_config.pad_token_id, padding="right") + final_segments = [x[1:] for x in current_segments] if (prompt_ids is not None and prompt_condition_type == "first-segment") else current_segments + sequences = _pad_to_max_length(final_segments, generation_config.pad_token_id, padding="right") # 8. If we return all segments, the predicted output sequences are put under `"sequences"`. 
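        # ------------------------------------------------------------------------------------------------
        # Editor's note: hedged usage sketch (not part of the diff) of the `prompt_ids` /
        # `prompt_condition_type` arguments introduced in this PR. The checkpoint name, the prompt text
        # and the dummy audio below are assumptions made purely for illustration; the keyword arguments
        # mirror the new `generate()` signature added above.
        #
        #     from transformers import WhisperForConditionalGeneration, WhisperProcessor
        #
        #     processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        #     model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
        #
        #     prompt_ids = processor.get_prompt_ids("Mr. Quilter", return_tensors="pt")
        #     inputs = processor([0.0] * 16_000, sampling_rate=16_000, return_tensors="pt")
        #
        #     # `prompt_condition_type` takes effect on the long-form (> 30 s) path: "first-segment"
        #     # conditions only the first chunk (and the prompt entry is dropped from the returned
        #     # segments), while "all-segments" re-uses the prompt for every chunk and requires
        #     # `condition_on_prev_tokens=True`.
        #     out = model.generate(
        #         inputs.input_features,
        #         prompt_ids=prompt_ids,
        #         prompt_condition_type="first-segment",
        #     )
        #     text = processor.batch_decode(out, skip_special_tokens=True)
        # ------------------------------------------------------------------------------------------------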
if return_segments: - return {"sequences": sequences, "segments": current_segments} + return {"sequences": sequences, "segments": final_segments} return sequences @@ -721,9 +736,19 @@ def generate_with_fallback( for fallback_idx, temperature in enumerate(temperatures): generation_config.do_sample = temperature is not None and temperature > 0.0 - generation_config.temperature = temperature + + if generation_config.do_sample: + generation_config.temperature = temperature + else: + # default + generation_config.temperature = 1.0 + generation_config.num_beams = kwargs.pop("num_beams", 1) if not generation_config.do_sample else 1 + # print(decoder_input_ids) + print(tok.batch_decode(decoder_input_ids, skip_special_tokens=False)) + print(temperature) + seek_outputs = super().generate( segment_input, generation_config, @@ -777,8 +802,9 @@ def generate_with_fallback( seek_sequence_list[fallback_index_map[i]] = seek_sequence seek_outputs_list[fallback_index_map[i]] = seek_outputs[i] + is_low_temperature = temperature is None or temperature < 0.5 do_condition_on_prev_tokens[fallback_index_map[i]] = ( - generation_config.condition_on_prev_tokens and temperature is not None and temperature < 0.5 + generation_config.condition_on_prev_tokens and is_low_temperature ) if needs_fallback[i]: @@ -804,6 +830,23 @@ def generate_with_fallback( return seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens + + @staticmethod + def _prepare_segments(prompt_ids, batch_size, prompt_condition_type, generation_config): + generation_config.prompt_condition_type = prompt_condition_type + + if prompt_ids is not None and prompt_condition_type == "first-segment": + prompt_ids = prompt_ids[1:] if prompt_ids[0] == generation_config.prev_sot_token_id else prompt_ids + current_segments = [[{"tokens": prompt_ids}] for _ in range(batch_size)] + else: + current_segments = [[] for _ in range(batch_size)] + + if generation_config.condition_on_prev_tokens is not True and prompt_condition_type == "all-segments": + raise ValueError("Make sure to set `condition_on_prev_tokens=True` when setting `prompt_condition_type='all-segments'`.") + + return current_segments + + def _postprocess_outputs(self, seek_outputs, return_token_timestamps, generation_config): if return_token_timestamps and hasattr(generation_config, "alignment_heads"): num_frames = getattr(generation_config, "num_frames", None) @@ -1017,7 +1060,7 @@ def _set_language_and_task(language, task, is_multilingual, generation_config): generation_config.task = task @staticmethod - def _set_forced_decoder_ids(task, language, prompt_ids, generation_config, config, kwargs): + def retrieve_init_tokens(generation_config, config, kwargs): forced_decoder_ids = None # Legacy code for backward compatibility if hasattr(config, "forced_decoder_ids") and config.forced_decoder_ids is not None: @@ -1027,83 +1070,73 @@ def _set_forced_decoder_ids(task, language, prompt_ids, generation_config, confi else: forced_decoder_ids = kwargs.pop("forced_decoder_ids", None) - if task is not None or language is not None or (forced_decoder_ids is None and prompt_ids is not None): - forced_decoder_ids = [] - if hasattr(generation_config, "language"): - if generation_config.language in generation_config.lang_to_id.keys(): - language_token = generation_config.language - elif generation_config.language in TO_LANGUAGE_CODE.keys(): - language_token = f"<|{TO_LANGUAGE_CODE[generation_config.language]}|>" - elif generation_config.language in TO_LANGUAGE_CODE.values(): - language_token = 
f"<|{generation_config.language}|>" - else: - is_language_code = len(generation_config.language) == 2 - raise ValueError( - f"Unsupported language: {generation_config.language}. Language should be one of:" - f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}." - ) - if language_token not in generation_config.lang_to_id: - raise ValueError( - f"{language_token} is not supported by this specific model as it is not in the `generation_config.lang_to_id`." - "(You should just add it to the generation config)" - ) - forced_decoder_ids.append((1, generation_config.lang_to_id[language_token])) + task = getattr(generation_config, "task", None) + language = getattr(generation_config, "language", None) + + if forced_decoder_ids is not None and task is not None: + logger.warn(f"You have passed task={task}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. Make sure to either remove `forced_decoder_ids` from your `generation_config` or don't set `task`. `forced_decoder_ids` will be ignored in favor of task={task}.") + forced_decoder_ids = None + elif forced_decoder_ids is not None and language is not None: + logger.warn(f"You have passed language={language}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. Make sure to either remove `forced_decoder_ids` from your `generation_config` or don't set `language`. `forced_decoder_ids` will be ignored in favor of language={language}.") + forced_decoder_ids = None + + init_tokens = [generation_config.decoder_start_token_id] + if forced_decoder_ids is not None and forced_decoder_ids[0][0] == 1: + i = 1 + while len(forced_decoder_ids) > 0 and forced_decoder_ids[0][0] == i: + init_tokens += [forced_decoder_ids[0][1]] + forced_decoder_ids = forced_decoder_ids[1:] + i += 1 + + if len(forced_decoder_ids) > 0: + warnings.warn( + f"You are using token ids in `forced_decoder_ids` that do not seem to be part of the initial prompt ids: {forced_decoder_ids}. This functionality has been deprecated and will throw an error in v4.39.", + FutureWarning, + ) + + # TODO(Sanchit): set generation_config.forced_decoder_ids to None for v4.39 + generation_config.forced_decoder_ids = forced_decoder_ids if len(forced_decoder_ids) > 0 else None + + if language is not None: + if language in generation_config.lang_to_id.keys(): + language_token = language + elif language in TO_LANGUAGE_CODE.keys(): + language_token = f"<|{TO_LANGUAGE_CODE[language]}|>" + elif language in TO_LANGUAGE_CODE.values(): + language_token = f"<|{language}|>" else: - forced_decoder_ids.append((1, None)) # automatically detect the language - - if hasattr(generation_config, "task"): - if generation_config.task in TASK_IDS: - forced_decoder_ids.append((2, generation_config.task_to_id[generation_config.task])) - else: - raise ValueError( - f"The `{generation_config.task}`task is not supported. 
The task should be one of `{TASK_IDS}`" - ) - elif hasattr(generation_config, "task_to_id"): - forced_decoder_ids.append((2, generation_config.task_to_id["transcribe"])) # defaults to transcribe - if hasattr(generation_config, "no_timestamps_token_id") and not generation_config.return_timestamps: - idx = forced_decoder_ids[-1][0] + 1 if forced_decoder_ids else 1 - forced_decoder_ids.append((idx, generation_config.no_timestamps_token_id)) - - if forced_decoder_ids is not None: - generation_config.forced_decoder_ids = forced_decoder_ids - - if prompt_ids is not None: - if kwargs.get("decoder_start_token_id") is not None: + is_language_code = len(language) == 2 raise ValueError( - "When specifying `prompt_ids`, you cannot also specify `decoder_start_token_id` as it gets overwritten." + f"Unsupported language: {language}. Language should be one of:" + f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}." ) - prompt_ids = prompt_ids.tolist() - decoder_start_token_id, *text_prompt_ids = prompt_ids - # Slicing the text prompt ids in a manner consistent with the OpenAI implementation - # to accomodate context space for the prefix (see https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/decoding.py#L599) - text_prompt_ids = text_prompt_ids[-config.max_target_positions // 2 - 1 :] - # Set the decoder_start_token_id to <|startofprev|> - kwargs.update({"decoder_start_token_id": decoder_start_token_id}) - - # If the user passes `max_new_tokens`, increase its number to account for the prompt - if kwargs.get("max_new_tokens", None) is not None: - kwargs["max_new_tokens"] += len(text_prompt_ids) - if kwargs["max_new_tokens"] >= config.max_target_positions: - raise ValueError( - f"The length of the sliced `prompt_ids` is {len(text_prompt_ids)}, and the `max_new_tokens` " - f"{kwargs['max_new_tokens'] - len(text_prompt_ids)}. Thus, the combined length of the sliced " - f"`prompt_ids` and `max_new_tokens` is: {kwargs['max_new_tokens']}. This exceeds the " - f"`max_target_positions` of the Whisper model: {config.max_target_positions}. " - "You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, " - f"so that their combined length is less that {config.max_target_positions}." - ) - - # Reformat the forced_decoder_ids to incorporate the prompt - non_prompt_forced_decoder_ids = ( - kwargs.pop("forced_decoder_ids", None) or generation_config.forced_decoder_ids - ) - forced_decoder_ids = [ - *text_prompt_ids, - generation_config.decoder_start_token_id, - *[token for _, token in non_prompt_forced_decoder_ids], - ] - forced_decoder_ids = [(rank + 1, token) for rank, token in enumerate(forced_decoder_ids)] - generation_config.forced_decoder_ids = forced_decoder_ids + if language_token not in generation_config.lang_to_id: + raise ValueError( + f"{language_token} is not supported by this specific model as it is not in the `generation_config.lang_to_id`." + "(You should just add it to the generation config)" + ) + elif task is not None: + # if task was passed, but language was not set, default to English + language_token = f"<|en|>" + + init_tokens.append(generation_config.lang_to_id[language_token]) + + if task is not None: + if task in TASK_IDS: + init_tokens.append(generation_config.task_to_id[generation_config.task]) + else: + raise ValueError( + f"The `{task}`task is not supported. 
The task should be one of `{TASK_IDS}`" + ) + elif hasattr(generation_config, "task_to_id"): + init_tokens.append(generation_config.task_to_id["transcribe"]) # defaults to transcribe + + if not generation_config.return_timestamps: + init_tokens.append(generation_config.no_timestamps_token_id) + elif generation_config.return_timestamps and init_tokens[-1] == generation_config.no_timestamps_token_id: + init_tokens = init_tokens[-1:] + + return init_tokens @staticmethod def _set_token_ids(generation_config, config, kwargs): @@ -1186,37 +1219,9 @@ def _retrieve_max_frames_and_seek(batch_size, attention_mask, total_input_frames return max_frames, seek - @staticmethod - def _retrieve_init_tokens_from_forced_decoder_ids(generation_config): - init_tokens = [generation_config.decoder_start_token_id] - forced_decoder_ids = generation_config.forced_decoder_ids - if forced_decoder_ids is not None and forced_decoder_ids[0][0] == 1: - i = 1 - while len(forced_decoder_ids) > 0 and forced_decoder_ids[0][0] == i: - init_tokens += [forced_decoder_ids[0][1]] - forced_decoder_ids = forced_decoder_ids[1:] - i += 1 - - forced_decoder_ids = forced_decoder_ids if len(forced_decoder_ids) > 0 else None - generation_config.forced_decoder_ids = forced_decoder_ids - - return init_tokens - def _retrieve_logit_processors( - self, generation_config, logits_processor, no_speech_threshold, is_shortform, num_beams + self, generation_config, logits_processor, begin_index, is_shortform, num_beams ): - forced_decoder_ids = generation_config.forced_decoder_ids - if generation_config.return_timestamps is True: - last_forced_decoder_ids = forced_decoder_ids[-1][-1] if forced_decoder_ids is not None else None - if last_forced_decoder_ids == generation_config.no_timestamps_token_id: - # remove no_timestamp to be forcefully generated if we want to return timestamps - # this is also important to make sure `WhisperTimeStampLogitsProcessor` functions correctly - forced_decoder_ids = forced_decoder_ids[:-1] if len(forced_decoder_ids) > 1 else None - # Make sure that if list is empty we set it to None - generation_config.forced_decoder_ids = forced_decoder_ids - - begin_index = len(forced_decoder_ids) + 1 if forced_decoder_ids is not None else 1 - if generation_config.return_timestamps is True: timestamp_processor = WhisperTimeStampLogitsProcessor(generation_config, begin_index=begin_index) logits_processor = ( @@ -1243,7 +1248,7 @@ def _retrieve_logit_processors( ) generation_config.begin_suppress_tokens = None - if no_speech_threshold is not None and not is_shortform: + if generation_config.no_speech_threshold is not None and not is_shortform: no_speech_detector = WhisperNoSpeechDetection( no_speech_token=generation_config.no_timestamps_token_id - 1, begin_index=begin_index, @@ -1256,11 +1261,12 @@ def _retrieve_logit_processors( if is_shortform and generation_config.forced_decoder_ids is not None: forced_tokens_proc = ForceTokensLogitsProcessor(generation_config.forced_decoder_ids) - # TODO(Patrick): It's important that the `forced_tokens_proc` processor is appended after + # It's important that the `forced_tokens_proc` processor is appended after # the suppress_tokens processor or else it might happen that all token logits are suppressed to -inf # which would lead to unexpected behavior # The better approach here is to NOT make use of the `forced_tokens_proc` for Whisper and instead # initialize all of them as `decoder_input_ids`. 
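            # Editor's note: illustrative sketch (not part of the diff) of the two ways the initial
            # tokens can be imposed. The new path in this PR broadcasts `init_tokens` into
            # `decoder_input_ids`; the legacy path forces one token id per generation step through a
            # logits processor. The token ids below are the usual multilingual Whisper specials,
            # hard-coded purely for illustration.
            #
            #     import torch
            #
            #     init_tokens = [50258, 50259, 50359, 50363]  # <|startoftranscript|><|en|><|transcribe|><|notimestamps|>
            #     batch_size = 2
            #
            #     # new approach: hand the tokens to generate() as decoder_input_ids
            #     one = torch.ones((batch_size, 1), dtype=torch.long)
            #     decoder_input_ids = torch.cat([t * one for t in init_tokens], dim=-1)  # shape (2, 4)
            #
            #     # legacy approach: at generation step `pos`, zero out every logit except the forced id
            #     def force_token(scores, pos, forced_ids):  # forced_ids = {step: token_id}
            #         if pos in forced_ids:
            #             scores[:] = float("-inf")
            #             scores[:, forced_ids[pos]] = 0.0
            #         return scores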
+ # TODO(Sanchit): Make sure to deprecate this in v4.39 as there will be no `forced_decoder_ids` anymore. logits_processor = ( [forced_tokens_proc] if logits_processor is None else logits_processor + [forced_tokens_proc] ) @@ -1310,6 +1316,7 @@ def _prepare_decoder_input_ids( current_segments, batch_idx_map, do_condition_on_prev_tokens, + prompt_ids, generation_config, config, device, @@ -1328,19 +1335,27 @@ def _prepare_decoder_input_ids( if any(do_condition_on_prev_tokens) and len(current_segments[0]) > 0: # according to https://github.com/openai/whisper/blob/e58f28804528831904c3b6f2c0e473f346223433/whisper/decoding.py#L609 active_segments = [current_segments[i] if do_condition_on_prev_tokens[i] else None for i in batch_idx_map] - prev_start_of_text = getattr(generation_config, "prev_bos_token_id", None) or prev_start_of_text - bos_token_tensor = prev_start_of_text * one_tensor[0] + if prompt_ids is not None and generation_config.prompt_condition_type == "all-segments": + prev_ids = prompt_ids + else: + prev_ids = prev_start_of_text * one_tensor[0] + prev_tokens = _pad_to_max_length( active_segments, generation_config.pad_token_id, padding="left", - bos_token_tensor=bos_token_tensor, + bos_token_tensor=prev_ids, cut_off_length=cut_off_length, ) decoder_input_ids = torch.cat([prev_tokens, decoder_input_ids], dim=-1) kwargs["decoder_attention_mask"] = decoder_input_ids != generation_config.pad_token_id + elif prompt_ids is not None: + prev_tokens = prompt_ids[None].repeat(decoder_input_ids.shape[0], 1) + decoder_input_ids = torch.cat([prev_tokens, decoder_input_ids], dim=-1) + # make sure `"decoder_attention_mask"` is not passed to forward + kwargs.pop("decoder_attention_mask", None) else: # make sure `"decoder_attention_mask"` is not passed to forward kwargs.pop("decoder_attention_mask", None) From afdc51d37d97e96fc5c17d00a638b92a5192c2fc Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 24 Jan 2024 22:09:43 +0000 Subject: [PATCH 02/25] Fix more --- .../models/whisper/generation_whisper.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 4acdf3328137aa..bb1e1426e70e4f 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -1008,15 +1008,6 @@ def _set_return_timestamps(return_timestamps, is_shortform, generation_config): "requires the model to predict timestamp tokens. Please either pass `return_timestamps=True` or make sure to pass no more than 3000 mel input features." ) - if not hasattr(generation_config, "no_timestamps_token_id"): - raise ValueError( - "You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which " - "requires the generation config to have `no_timestamps_token_id` correctly. " - "Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. " - "For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363" - "or make sure to pass no more than 3000 mel input features." 
- ) - logger.info("Setting `return_timestamps=True` for long-form generation.") generation_config.return_timestamps = True else: @@ -1115,11 +1106,11 @@ def retrieve_init_tokens(generation_config, config, kwargs): f"{language_token} is not supported by this specific model as it is not in the `generation_config.lang_to_id`." "(You should just add it to the generation config)" ) + init_tokens.append(generation_config.lang_to_id[language_token]) elif task is not None: - # if task was passed, but language was not set, default to English - language_token = f"<|en|>" + # if task was passed, but language was not set, default to English (first lang token) + language_token = generation_config.decoder_start_token_id + 1 - init_tokens.append(generation_config.lang_to_id[language_token]) if task is not None: if task in TASK_IDS: @@ -1131,7 +1122,7 @@ def retrieve_init_tokens(generation_config, config, kwargs): elif hasattr(generation_config, "task_to_id"): init_tokens.append(generation_config.task_to_id["transcribe"]) # defaults to transcribe - if not generation_config.return_timestamps: + if not generation_config.return_timestamps and hasattr(generation_config, "no_timestamps_token_id"): init_tokens.append(generation_config.no_timestamps_token_id) elif generation_config.return_timestamps and init_tokens[-1] == generation_config.no_timestamps_token_id: init_tokens = init_tokens[-1:] From 5a2eac04e44c133bdf627b8bbfdabe691bc6c3ff Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 25 Jan 2024 11:46:04 +0000 Subject: [PATCH 03/25] Correct more --- .../models/whisper/generation_whisper.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index bb1e1426e70e4f..4f8d6f7fbff285 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -993,15 +993,7 @@ def _set_return_outputs( @staticmethod def _set_return_timestamps(return_timestamps, is_shortform, generation_config): - if return_timestamps is True: - if not hasattr(generation_config, "no_timestamps_token_id"): - raise ValueError( - "You are trying to return timestamps, but the generation config is not properly set. " - "Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. " - "For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363" - ) - generation_config.return_timestamps = True - elif not is_shortform: + if not is_shortform: if return_timestamps is False: raise ValueError( "You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which " @@ -1009,9 +1001,16 @@ def _set_return_timestamps(return_timestamps, is_shortform, generation_config): ) logger.info("Setting `return_timestamps=True` for long-form generation.") - generation_config.return_timestamps = True - else: - generation_config.return_timestamps = False + return_timestamps = True + + if return_timestamps and not hasattr(generation_config, "no_timestamps_token_id"): + raise ValueError( + "You are trying to return timestamps, but the generation config is not properly set. " + "Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. 
" + "For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363" + ) + + generation_config.return_timestamps = return_timestamps @staticmethod def _set_language_and_task(language, task, is_multilingual, generation_config): From 20d62b992b51cd942a056283b28785464a52d32c Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 25 Jan 2024 15:16:27 +0000 Subject: [PATCH 04/25] Fix more tests --- .../models/whisper/generation_whisper.py | 34 ++++++++++++++++--- tests/models/whisper/test_modeling_whisper.py | 20 +++++------ 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 4f8d6f7fbff285..a2930032bbea95 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -528,8 +528,11 @@ def generate( config=self.config, kwargs=kwargs, ) + # TODO(Sanchit) - passing `decoder_input_ids` is deprecated. One should use `prompt_ids` instead + # This function should be be removed in v4.39 + self._check_decoder_input_ids(prompt_ids=prompt_ids, is_shortform=is_shortform, kwargs=kwargs) - # 4. Retrieve logits processors + # 3. Retrieve logits processors begin_index = len(init_tokens) logits_processor = self._retrieve_logit_processors( generation_config=generation_config, @@ -545,13 +548,24 @@ def generate( kwargs["temperature"] = temperature decoder_input_ids = kwargs.pop("decoder_input_ids", None) - - if decoder_input_ids is not None and len(init_tokens) > 0: - logger.warn(f"You have provided `decoder_input_ids` which will overwrite the `init_tokens` {init_tokens}. This might lead to unexpected behavior.") - elif len(init_tokens) > 0: + if decoder_input_ids is None: one_tensor = torch.ones((input_features.shape[0], 1), device=input_features.device, dtype=torch.long) decoder_input_ids = torch.cat([t * one_tensor for t in init_tokens], dim=-1) + if prompt_ids is not None: + decoder_input_ids = torch.cat([prompt_ids[None].repeat(input_features.shape[0], 1), decoder_input_ids], dim=-1) + + if kwargs.get("max_new_tokens", 0) + decoder_input_ids.shape[-1] > self.config.max_target_positions: + max_new_tokens = kwargs.get("max_new_tokens", 0) + raise ValueError( + f"The length of `decoder_input_ids` equal `prompt_ids` plus special start tokens is {decoder_input_ids.shape[-1]}, and the `max_new_tokens` " + f"is {max_new_tokens}. Thus, the combined length of " + f"`decoder_input_ids` and `max_new_tokens` is: {max_new_tokens + decoder_input_ids.shape[-1]}. This exceeds the " + f"`max_target_positions` of the Whisper model: {self.config.max_target_positions}. " + "You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, " + f"so that their combined length is less than {self.config.max_target_positions}." + ) + outputs = super().generate( input_features, generation_config=generation_config, @@ -1128,6 +1142,16 @@ def retrieve_init_tokens(generation_config, config, kwargs): return init_tokens + @staticmethod + def _check_decoder_input_ids(prompt_ids, is_shortform, kwargs): + decoder_input_ids = kwargs.get("decoder_input_ids", None) + if prompt_ids is not None and decoder_input_ids is not None: + raise ValueError(f"Cannot pass both `prompt_ids`: {prompt_ids} and `decoder_input_ids`: {decoder_input_ids}. 
Passing `decoder_input_ids` is deprecated, consider not passing it.") + elif decoder_input_ids is not None and not is_shortform: + raise ValueError(f"Cannot pass both `decoder_input_ids`: {decoder_input_ids} for long-form generation. Consider passing `prompt_ids` instead.") + elif decoder_input_ids is not None and is_shortform: + warnings.warn(f"You have provided `decoder_input_ids` which will overwrite the `init_tokens` {init_tokens}. This might lead to unexpected behavior. Passing `decoder_input_ids` is deprecated and will be removed in v4.39. Consider passing `prompt_ids` instead.", FutureWarning) + @staticmethod def _set_token_ids(generation_config, config, kwargs): eos_token_id = kwargs.pop("eos_token_id", None) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 505d2e991033d8..6ffa591d9c255a 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1298,22 +1298,22 @@ def test_generate_with_prompt_ids_and_forced_decoder_ids(self): def test_generate_with_prompt_ids_max_length(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.max_target_positions = 5 + config.max_target_positions = 7 model = WhisperForConditionalGeneration(config).eval().to(torch_device) input_features = input_dict["input_features"] - prompt_ids = np.asarray(range(4)) - sliced_prompt_ids = prompt_ids[1:] - sliced_prompt_ids = sliced_prompt_ids[-config.max_target_positions // 2 - 1 :] - max_new_tokens = 5 + decoder_input_ids = torch.arange(5).to("cuda") + prompt_ids = decoder_input_ids[:4] + max_new_tokens = 8 with self.assertRaisesRegex( ValueError, - f"The length of the sliced `prompt_ids` is {len(sliced_prompt_ids)}, and the `max_new_tokens` " - f"{max_new_tokens}. Thus, the combined length of the sliced `prompt_ids` and `max_new_tokens` is: " - f"{len(sliced_prompt_ids) + max_new_tokens}. This exceeds the `max_target_positions` of the Whisper model: " - f"{config.max_target_positions}. You should either reduce the length of your prompt, or reduce the " - f"value of `max_new_tokens`, so that their combined length is less that {config.max_target_positions}.", + f"The length of `decoder_input_ids` equal `prompt_ids` plus special start tokens is {decoder_input_ids.shape[-1]}, and the `max_new_tokens` " + f"is {max_new_tokens}. Thus, the combined length of " + f"`decoder_input_ids` and `max_new_tokens` is: {max_new_tokens + decoder_input_ids.shape[-1]}. This exceeds the " + f"`max_target_positions` of the Whisper model: {config.max_target_positions}. " + "You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, " + f"so that their combined length is less than {config.max_target_positions}." 
): model.generate(input_features, max_new_tokens=max_new_tokens, prompt_ids=prompt_ids) From eed02ae32178d5902605fe382906a3e5875fb38a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 25 Jan 2024 15:48:07 +0000 Subject: [PATCH 05/25] fix fast tests --- src/transformers/models/whisper/generation_whisper.py | 6 +++--- tests/models/whisper/test_modeling_whisper.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index a2930032bbea95..3fff01f931f781 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -530,7 +530,7 @@ def generate( ) # TODO(Sanchit) - passing `decoder_input_ids` is deprecated. One should use `prompt_ids` instead # This function should be be removed in v4.39 - self._check_decoder_input_ids(prompt_ids=prompt_ids, is_shortform=is_shortform, kwargs=kwargs) + self._check_decoder_input_ids(prompt_ids=prompt_ids, init_tokens=init_tokens, is_shortform=is_shortform, kwargs=kwargs) # 3. Retrieve logits processors begin_index = len(init_tokens) @@ -1143,7 +1143,7 @@ def retrieve_init_tokens(generation_config, config, kwargs): return init_tokens @staticmethod - def _check_decoder_input_ids(prompt_ids, is_shortform, kwargs): + def _check_decoder_input_ids(prompt_ids, init_tokens, is_shortform, kwargs): decoder_input_ids = kwargs.get("decoder_input_ids", None) if prompt_ids is not None and decoder_input_ids is not None: raise ValueError(f"Cannot pass both `prompt_ids`: {prompt_ids} and `decoder_input_ids`: {decoder_input_ids}. Passing `decoder_input_ids` is deprecated, consider not passing it.") @@ -1353,7 +1353,7 @@ def _prepare_decoder_input_ids( if prompt_ids is not None and generation_config.prompt_condition_type == "all-segments": prev_ids = prompt_ids else: - prev_ids = prev_start_of_text * one_tensor[0] + prev_ids = prev_start_of_text * one_tensor[0] if prev_start_of_text is not None else None prev_tokens = _pad_to_max_length( active_segments, diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 6ffa591d9c255a..f54b0a4b318b4c 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1258,7 +1258,7 @@ def test_generate_with_prompt_ids_and_task_and_language(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() model = WhisperForConditionalGeneration(config).eval().to(torch_device) input_features = input_dict["input_features"] - prompt_ids = np.arange(5) + prompt_ids = torch.arange(5).to(torch_device) language = "<|de|>" task = "translate" lang_id = 6 @@ -1281,7 +1281,7 @@ def test_generate_with_prompt_ids_and_forced_decoder_ids(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() model = WhisperForConditionalGeneration(config).eval().to(torch_device) input_features = input_dict["input_features"] - prompt_ids = np.asarray(range(5)) + prompt_ids = torch.arange(5).to("cuda") forced_decoder_ids = [(1, 6), (2, 7), (3, 8)] output = model.generate( @@ -1986,7 +1986,7 @@ def test_generate_with_prompt_ids_and_forced_decoder_ids(self): language = "de" expected_tokens = [f"<|{task}|>", f"<|{language}|>"] prompt = "test prompt" - prompt_ids = processor.get_prompt_ids(prompt) + prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt") output = model.generate(input_features, task=task, 
language=language, prompt_ids=prompt_ids) text = processor.decode(output[0]) From 6ef129316ebb7d435a54678f544a367caa970946 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 25 Jan 2024 18:47:23 +0000 Subject: [PATCH 06/25] Fix more --- .../models/whisper/generation_whisper.py | 92 +++++++++++-------- tests/models/whisper/test_modeling_whisper.py | 20 ++-- 2 files changed, 66 insertions(+), 46 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 3fff01f931f781..164f005915704c 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -16,7 +16,7 @@ import math import warnings import zlib -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Union, Iterator import numpy as np import torch @@ -159,7 +159,7 @@ def _pad_to_max_length(current_segments, pad_token_id, padding="right", bos_toke class WhisperGenerationMixin: - def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None): + def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None, input_length=0): """ Calculates token-level timestamps using the encoder-decoder cross-attentions and dynamic time-warping (DTW) to map each output token to a position in the input audio. If `num_frames` is specified, the encoder-decoder @@ -200,7 +200,7 @@ def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_prec dim=2, ) - timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32) + timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)[:, input_length:] batch_size = timestamps.shape[0] if num_frames is not None: @@ -579,7 +579,7 @@ def generate( if generation_config.return_token_timestamps and hasattr(generation_config, "alignment_heads"): outputs["token_timestamps"] = self._extract_token_timestamps( - outputs, generation_config.alignment_heads, num_frames=generation_config.num_frames + outputs, generation_config.alignment_heads, num_frames=generation_config.num_frames, input_length=decoder_input_ids.shape[-1] - 1 ) return outputs @@ -751,12 +751,7 @@ def generate_with_fallback( for fallback_idx, temperature in enumerate(temperatures): generation_config.do_sample = temperature is not None and temperature > 0.0 - if generation_config.do_sample: - generation_config.temperature = temperature - else: - # default - generation_config.temperature = 1.0 - + generation_config.temperature = temperature if generation_config.do_sample else 1.0 generation_config.num_beams = kwargs.pop("num_beams", 1) if not generation_config.do_sample else 1 # print(decoder_input_ids) @@ -775,13 +770,13 @@ def generate_with_fallback( ) # post-process sequence tokens and outputs to be in list form - sequence_tokens, seek_outputs = self._postprocess_outputs( - seek_outputs, return_token_timestamps, generation_config + seek_sequences, seek_outputs = self._postprocess_outputs( + seek_outputs=seek_outputs, + decoder_input_ids=decoder_input_ids, + return_token_timestamps=return_token_timestamps, + generation_config=generation_config, ) - # remove all previously passed decoder input ids - seek_sequences = sequence_tokens[:, decoder_input_ids.shape[-1] :] - # 6.7 Extract cut sequences from every sequence and check if fallback should be applied # Loop over each decoded audio individually as each decoding 
can be of a different length new_fallback_index_map = [] @@ -861,30 +856,33 @@ def _prepare_segments(prompt_ids, batch_size, prompt_condition_type, generation_ return current_segments - def _postprocess_outputs(self, seek_outputs, return_token_timestamps, generation_config): + def _postprocess_outputs(self, seek_outputs, decoder_input_ids, return_token_timestamps, generation_config): + # remove all previously passed decoder input ids + if isinstance(seek_outputs, torch.Tensor): + seek_outputs = seek_outputs[:, decoder_input_ids.shape[-1] :] + return seek_outputs, seek_outputs + if return_token_timestamps and hasattr(generation_config, "alignment_heads"): num_frames = getattr(generation_config, "num_frames", None) seek_outputs["token_timestamps"] = self._extract_token_timestamps( - seek_outputs, generation_config.alignment_heads, num_frames=num_frames + seek_outputs, generation_config.alignment_heads, num_frames=num_frames, input_length=decoder_input_ids.shape[-1] - 1 ) - if generation_config.return_dict_in_generate: + seek_outputs["sequences"] = seek_outputs["sequences"][:, decoder_input_ids.shape[-1] :] - def split_by_batch_index(values, key, batch_idx): - if key == "scores": - return [v[batch_idx].cpu() for v in values] - if key == "past_key_values": - # we don't save `past_key_values` as this is too costly - return None - return values[batch_idx].cpu() + def split_by_batch_index(values, key, batch_idx): + if key == "scores": + return [v[batch_idx].cpu() for v in values] + if key == "past_key_values": + # we don't save `past_key_values` as this is too costly + return None + return values[batch_idx].cpu() - sequence_tokens = seek_outputs["sequences"] - seek_outputs = [ - {k: split_by_batch_index(v, k, i) for k, v in seek_outputs.items()} - for i in range(sequence_tokens.shape[0]) + sequence_tokens = seek_outputs["sequences"] + seek_outputs = [ + {k: split_by_batch_index(v, k, i) for k, v in seek_outputs.items()} + for i in range(sequence_tokens.shape[0]) ] - else: - sequence_tokens = seek_outputs return sequence_tokens, seek_outputs @@ -1065,6 +1063,16 @@ def _set_language_and_task(language, task, is_multilingual, generation_config): @staticmethod def retrieve_init_tokens(generation_config, config, kwargs): + + def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): + """ short function to replace num with a itr in lst """ + found = any(i in lst for i in itr) + if found: + lst = [num if i in itr else i for i in lst] + else: + lst.append(num) + return lst + forced_decoder_ids = None # Legacy code for backward compatibility if hasattr(config, "forced_decoder_ids") and config.forced_decoder_ids is not None: @@ -1119,26 +1127,32 @@ def retrieve_init_tokens(generation_config, config, kwargs): f"{language_token} is not supported by this specific model as it is not in the `generation_config.lang_to_id`." 
"(You should just add it to the generation config)" ) - init_tokens.append(generation_config.lang_to_id[language_token]) - elif task is not None: - # if task was passed, but language was not set, default to English (first lang token) - language_token = generation_config.decoder_start_token_id + 1 + lang_id = generation_config.lang_to_id[language_token] + + # if language is defined it'll overwrite language ids that might have already been defined via the generation_config + replace_or_add(init_tokens, lang_id, generation_config.lang_to_id.values()) if task is not None: if task in TASK_IDS: init_tokens.append(generation_config.task_to_id[generation_config.task]) + task_id = generation_config.task_to_id[generation_config.task] + + # if task is defined it'll overwrite task ids that might have already been defined via the generation_config + replace_or_add(init_tokens, task_id, generation_config.task_to_id.values()) else: raise ValueError( f"The `{task}`task is not supported. The task should be one of `{TASK_IDS}`" ) - elif hasattr(generation_config, "task_to_id"): - init_tokens.append(generation_config.task_to_id["transcribe"]) # defaults to transcribe + elif language is not None and hasattr(generation_config, "task_to_id"): + # if language is defined, but no task id is in `init_tokens`, default to transcribe + if not any(i in init_tokens for i in generation_config.task_to_id.values()): + init_tokens.append(generation_config.task_to_id["transcribe"]) - if not generation_config.return_timestamps and hasattr(generation_config, "no_timestamps_token_id"): + if not generation_config.return_timestamps and hasattr(generation_config, "no_timestamps_token_id") and init_tokens[-1] != generation_config.no_timestamps_token_id: init_tokens.append(generation_config.no_timestamps_token_id) elif generation_config.return_timestamps and init_tokens[-1] == generation_config.no_timestamps_token_id: - init_tokens = init_tokens[-1:] + init_tokens = init_tokens[:-1] return init_tokens diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index f54b0a4b318b4c..dd704968b327ad 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1919,7 +1919,8 @@ def test_tiny_token_timestamp_batch_generation(self): num_return_sequences=num_return_sequences, ) - self.assertEqual(generate_outputs.sequences.shape, generate_outputs.token_timestamps.shape) + # task id and lang id prompts should not have timestamp tokens + self.assertEqual(generate_outputs.sequences.shape[-1] - 2, generate_outputs.token_timestamps.shape[-1]) self.assertEqual(len(generate_outputs.sequences), num_return_sequences * num_samples) @@ -1967,13 +1968,17 @@ def test_generate_with_prompt_ids(self): input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device) output_without_prompt = model.generate(input_features) - prompt_ids = processor.get_prompt_ids("Leighton") + prompt_ids = processor.get_prompt_ids("Leighton", return_tensors="pt").to("cuda") output_with_prompt = model.generate(input_features, prompt_ids=prompt_ids) expected_without_prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of Rocky Ithaca.<|endoftext|>" expected_with_prompt = "<|startofprev|> Leighton<|startoftranscript|><|en|><|transcribe|><|notimestamps|> He has grave doubts whether Sir Frederick Leighton's work is really 
Greek after all and can discover in it but little of Rocky Ithaca.<|endoftext|>" - self.assertEqual(processor.decode(output_without_prompt[0]), expected_without_prompt) - self.assertEqual(processor.decode(output_with_prompt[0]), expected_with_prompt) + + output_without_prompt = processor.decode(output_without_prompt[0]) + output_with_prompt = processor.decode(output_with_prompt[0]) + + self.assertEqual(output_without_prompt, expected_without_prompt) + self.assertEqual(output_with_prompt, expected_with_prompt) @slow def test_generate_with_prompt_ids_and_forced_decoder_ids(self): @@ -1986,7 +1991,7 @@ def test_generate_with_prompt_ids_and_forced_decoder_ids(self): language = "de" expected_tokens = [f"<|{task}|>", f"<|{language}|>"] prompt = "test prompt" - prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt") + prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to("cuda") output = model.generate(input_features, task=task, language=language, prompt_ids=prompt_ids) text = processor.decode(output[0]) @@ -2002,7 +2007,7 @@ def test_generate_with_prompt_ids_and_no_non_prompt_forced_decoder_ids(self): input_speech = self._load_datasamples(1) input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device) prompt = "test prompt" - prompt_ids = processor.get_prompt_ids(prompt) + prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to("cuda") model.generation_config.forced_decoder_ids = None model.config.forced_decoder_ids = None @@ -2176,7 +2181,8 @@ def test_whisper_longform_single_batch_prev_cond(self): result = model.generate(input_features, **gen_kwargs) decoded = processor.batch_decode(result, skip_special_tokens=True) - assert decoded == EXPECTED_TEXT + import ipdb; ipdb.set_trace() + assert decoded[0] == EXPECTED_TEXT[0] @slow def test_whisper_longform_multi_batch(self): From e2a124af00c531834c894ba90bbedc04148007d4 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 25 Jan 2024 18:47:29 +0000 Subject: [PATCH 07/25] fix more --- .../models/whisper/generation_whisper.py | 18 ++++++++++++------ tests/models/whisper/test_modeling_whisper.py | 19 +++++++++++-------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 164f005915704c..25449598141519 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -159,7 +159,7 @@ def _pad_to_max_length(current_segments, pad_token_id, padding="right", bos_toke class WhisperGenerationMixin: - def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None, input_length=0): + def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None): """ Calculates token-level timestamps using the encoder-decoder cross-attentions and dynamic time-warping (DTW) to map each output token to a position in the input audio. 
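        As a rough intuition (an editor-added toy sketch, not the actual implementation), the DTW step
        can be pictured as finding the cheapest monotonic path through a token-by-frame cost matrix and
        reading off, for every output token, the first audio frame that lies on that path:

            import numpy as np

            def dtw_path(cost):  # cost[i, j]: mismatch between output token i and audio frame j
                n, m = cost.shape
                acc = np.full((n + 1, m + 1), np.inf)
                acc[0, 0] = 0.0
                for i in range(1, n + 1):
                    for j in range(1, m + 1):
                        acc[i, j] = cost[i - 1, j - 1] + min(acc[i - 1, j - 1], acc[i - 1, j], acc[i, j - 1])
                # backtrace from the bottom-right corner towards (0, 0)
                i, j, path = n, m, []
                while i > 0 and j > 0:
                    path.append((i - 1, j - 1))
                    step = int(np.argmin([acc[i - 1, j - 1], acc[i - 1, j], acc[i, j - 1]]))
                    i, j = (i - 1, j - 1) if step == 0 else (i - 1, j) if step == 1 else (i, j - 1)
                return path[::-1]

            # multiplying the frame index at which each token first appears on the path by
            # `time_precision` (0.02 s per frame) yields a per-token timestamp, analogous to what
            # this method derives from the filtered cross-attention weights.
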
If `num_frames` is specified, the encoder-decoder @@ -200,7 +200,9 @@ def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_prec dim=2, ) - timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)[:, input_length:] + # make sure timestamps are as long as cross_attention + input_length = cross_attentions[0].shape[2] + 1 + timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)[:, :input_length] batch_size = timestamps.shape[0] if num_frames is not None: @@ -579,7 +581,7 @@ def generate( if generation_config.return_token_timestamps and hasattr(generation_config, "alignment_heads"): outputs["token_timestamps"] = self._extract_token_timestamps( - outputs, generation_config.alignment_heads, num_frames=generation_config.num_frames, input_length=decoder_input_ids.shape[-1] - 1 + outputs, generation_config.alignment_heads, num_frames=generation_config.num_frames ) return outputs @@ -865,7 +867,7 @@ def _postprocess_outputs(self, seek_outputs, decoder_input_ids, return_token_tim if return_token_timestamps and hasattr(generation_config, "alignment_heads"): num_frames = getattr(generation_config, "num_frames", None) seek_outputs["token_timestamps"] = self._extract_token_timestamps( - seek_outputs, generation_config.alignment_heads, num_frames=num_frames, input_length=decoder_input_ids.shape[-1] - 1 + seek_outputs, generation_config.alignment_heads, num_frames=num_frames ) seek_outputs["sequences"] = seek_outputs["sequences"][:, decoder_input_ids.shape[-1] :] @@ -1086,10 +1088,10 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): language = getattr(generation_config, "language", None) if forced_decoder_ids is not None and task is not None: - logger.warn(f"You have passed task={task}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. Make sure to either remove `forced_decoder_ids` from your `generation_config` or don't set `task`. `forced_decoder_ids` will be ignored in favor of task={task}.") + logger.info(f"You have passed task={task}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. `forced_decoder_ids` will be ignored in favor of task={task}.") forced_decoder_ids = None elif forced_decoder_ids is not None and language is not None: - logger.warn(f"You have passed language={language}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. Make sure to either remove `forced_decoder_ids` from your `generation_config` or don't set `language`. `forced_decoder_ids` will be ignored in favor of language={language}.") + logger.info(f"You have passed language={language}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. 
`forced_decoder_ids` will be ignored in favor of language={language}.") forced_decoder_ids = None init_tokens = [generation_config.decoder_start_token_id] @@ -1132,6 +1134,10 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): # if language is defined it'll overwrite language ids that might have already been defined via the generation_config replace_or_add(init_tokens, lang_id, generation_config.lang_to_id.values()) + elif task is not None and hasattr(generation_config, "lang_to_id"): + # default to English + lang_id = generation_config.decoder_start_token_id + 1 # start_token_id + 1 is + replace_or_add(init_tokens, lang_id, generation_config.lang_to_id.values()) if task is not None: if task in TASK_IDS: diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index dd704968b327ad..7a681cba0c974e 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -2153,7 +2153,7 @@ def test_whisper_longform_single_batch(self): @slow def test_whisper_longform_single_batch_prev_cond(self): # fmt: off - EXPECTED_TEXT = [""" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grieved doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite itals are as national as a jingo poem. Mr. Birk at Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. When Mr. John Collier gives his sitter a cheerful slap in the back, before he says like a shampooer and a Turkish bath, next man it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. He tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man, and remarks was pleasing courtesy in felicitous grace that many faces are feeling. Unfortunately his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the tupper of painting. By Harry Quilter M. A. A man said to the universe, Sir, I exist. Sweat covered Breon's body trickling into the tight-lowing cloth that was the only german he wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retroveilities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. 
The buzzers were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the twenties needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenties, he must have drawn his gun because the intruder said quickly, but that away you're being a fool. But there was silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry and victory to the stronger. Your man who entered the twenties had his own training tricks. They were appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. Breon's death was in some ways easier than defeat. Breon's softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our rogue. Breon sensed it and knew the fifth point was his. Then the powerful twist that's rested aside, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accooing dove. He has gone and gone for good, answered Polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has flooded disgrace, and your friends are asking for you. I begged Ruggido long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions, as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now, inquired Shaggy. In the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked Betsy thoughtfully. 
I don't believe Anne knew any magic, or she'd have worked it before. I do not know, confessed Shaggy. True, agreed Calico. Calico went to the big gong, and pounded on it, just as we're good to be used to do. But no one answered the summons. Having returned to the royal cavern, Calico first pounded the gong, and then sat in the throne, wearing Regidos discarded Ruby Crown, and holding in his hand to scepter, which Regidos had so often thrown at his head."""] + EXPECTED_TEXT = [""" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grieved doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite itals are as national as a jingo poem. Mr. Birk at Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. When Mr. John Collier gives his sitter a cheerful slap in the back, before he says like a shampooer and a Turkish bath, next man it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. He tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man, and remarks was pleasing courtesy in felicitous grace that many faces are feeling. Unfortunately his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the tupper of painting. By Harry Quilter M. A. A man said to the universe, Sir, I exist. Sweat covered Breon's body trickling into the tight-lowing cloth that was the only german he wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retroveilities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the twenties needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. 
The twenties, he must have drawn his gun because the intruder said quickly, but that away you're being a fool. But there was silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry and victory to the stronger. Your man who entered the twenties had his own training tricks. They were appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. Breon's death was in some ways easier than defeat. Breon's softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our rogue. Breon sensed it and knew the fifth point was his. Then the powerful twist that's rested aside, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accooing dove. He has gone and gone for good, answered Polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has flooded disgrace, and your friends are asking for you. I begged Ruggido long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions, as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now, inquired Shaggy. In the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked Betsy thoughtfully. I don't believe Anne knew any magic, or she'd have worked it before. I do not know, confessed Shaggy. True, agreed Calico. Calico went to the big gong and pounded on it, just as we're good to be used to do, but no one answered the summons. 
Having returned to the royal cavern, Calico first pounded the gong and then sat in the throne, wearing Regidos discarded Ruby crown, and holding in his hand to scepter which Regidos had so often thrown at his head."""] # fmt: on processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") @@ -2181,8 +2181,7 @@ def test_whisper_longform_single_batch_prev_cond(self): result = model.generate(input_features, **gen_kwargs) decoded = processor.batch_decode(result, skip_special_tokens=True) - import ipdb; ipdb.set_trace() - assert decoded[0] == EXPECTED_TEXT[0] + assert decoded == EXPECTED_TEXT @slow def test_whisper_longform_multi_batch(self): @@ -2236,10 +2235,10 @@ def test_whisper_longform_multi_batch(self): @slow def test_whisper_longform_multi_batch_prev_cond(self): # fmt: off - EXPECTED_TEXT_1 = [" Mr. Quilters manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of Rocky Ithaca. The Nils, pictures are sort of upguards and atom paintings and Mason's exquisite itals are as national as a jingo poem. Mr. Berkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says like a shampooer and a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate and expression. On the general principles of art, Mr. Quilters writes with equal lucidity. Painting he tells us is of a different quality to mathematics and finish in art is adding more effect. As for etchings, there are of two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing apostorer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin, for not recognizing that a picture should denote the frailty of man. And remarks with pleasing courtesy and solicitous grace that many phases of feeling only, unfortunately, his own work never does get good. Mr. Quilters has missed his chance, for he has failed even to make himself the tougher of painting. My hair equal to M.A. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the tight-wing cloth that was the only garment he wore. The cut on his chest still dripping blood. The ache of his overstrain dyes. Even the soaring arena around him with thousands of spectators, retrievalidies not worth thinking about. His instant panic was followed by a small sharp blow, high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzer's were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, knights and the dormitories were as quiet as death. 
Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. To 20s, he must have drawn his gun because the intruder said quickly, but that away, you're being a fool. Out, the resoundance then, and still wondering, Brienne was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. Our red-haired mountain of a man, with an apparently inexhaustible story of energy. There could be little art in this last and final round of fencing, just thrust and parry and victory to the stronger. Every man who entered the 20s had his own training tricks. There appeared to be an immediate association with the death trauma as if the two were inexplicably linked into one. This strengthened enables someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the 20s, and death during the last round was, in some ways, easier than defeat. Breathing deeply, Brienne softly spoke the other hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. I rolled the maze at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Brienne saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our role. Brienne sensed it and knew the fifth point was his. Then the powerful twist that's right to the side, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle with a bow, while poor shaggy sits there, a cooling dove. He has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stoutchanges as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled in disgrace in your friends, they're asking for you. I begged Ruggano a long ago to send him away, but he would not do so. I also offered to help you run into escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard since shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions, as well as our nooms, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico, whereas my brother now inquired shaggy in the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all our dominions replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked to Bedsey thoughtfully. I don't believe Anne knew any magic or she'd have worked before. I do not know, confessed shaggy. True, agreed Calico. Calico went to the big gong and pounded on it just as Ruggano used to do, but no one answered the summons. 
Having returned to the royal cavern, Calico first pounded the gong and then sat in the throne, wearing Ruggano's discarded ruby crown. And holding in his hand the scepter which Ruggano had so often thrown at his head."] + EXPECTED_TEXT_1 = [" Mr. Quilters manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of Rocky Ithaca. The Nils, pictures are sort of upguards and atom paintings and Mason's exquisite itals are as national as a jingo poem. Mr. Berkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says like a shampooer and a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate and expression. On the general principles of art, Mr. Quilters writes with equal lucidity. Painting he tells us is of a different quality to mathematics and finish in art is adding more effect. As for etchings, there are of two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing apostorer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin, for not recognizing that a picture should denote the frailty of man. And remarks with pleasing courtesy and solicitous grace that many phases of feeling only, unfortunately, his own work never does get good. Mr. Quilters has missed his chance, for he has failed even to make himself the tougher of painting. My hair equal to M.A. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the tight-wing cloth that was the only garment he wore. The cut on his chest still dripping blood. The ache of his overstrain dyes. Even the soaring arena around him with thousands of spectators, retrievalidies not worth thinking about. His instant panic was followed by a small sharp blow, high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzer's were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, knights and the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance. And brand is the one I must see. Now stand aside. To 20s, he must have drawn his gun because the intruder said quickly. But that away, he'd be no fool. Out, the resoundance then, and still wondering, Brienne was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man, with an apparently inexhaustible story of energy. 
There could be little art in this last and final round of fencing. Just thrust and parry and victory to the stronger. Every man who entered the 20s had his own training tricks. There appeared to be an immediate association with the death trauma as if the two were inexplicably linked into one. This strength that enables someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the 20s, and death during the last round was, in some ways, easier than defeat. Breathing deeply, Brienne softly spoke the other hypnotic phrases that triggered the process. In the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. I rolled the maze at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Brienne saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our role. Brienne sensed it and knew the fifth point was his. Then the powerful twist that's right to the side, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle with a bow, while poor shaggy sits there, a cooling dove. He has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stoutchanges as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled in disgrace in your friends, they're asking for you. I begged Ruggano a long ago to send him away, but he would not do so. I also offered to help you run into escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard since shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions, as well as our nooms, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico, whereas my brother now inquired shaggy in the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all our dominions replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked to Bedsey thoughtfully. I don't believe Anne knew any magic or she'd have worked before. I do not know, confessed shaggy. True, agreed Calico. Calico went to the big gong and pounded on it just as Ruggano used to do, but no one answered the summons. Having returned to the royal cavern, Calico first pounded the gong and then sat in the throne, wearing Ruggano's discarded ruby crown. And holding in his hand the scepter which Ruggano had so often thrown at his head."] EXPECTED_TEXT_2 = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins' work is really Greek after all, and can discover in it but little of rocky Ithaca. 
Lennials, pictures are a sort of upguards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Berkett Foster's landscapes smile at one much in the same way that Mr. Carker"] - EXPECTED_TEXT_3 = [" gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating in its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins work is really Greek after all and can discover in it but little of rocky ithaka. Lennils, pictures, are a sort of upguards and atom paintings and Mason's exquisite itals are as national as a jingo poem. Mr. Birkut Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says like a shampooer and a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate and expression. Under general principles of art, Mr. Quilter writes with equal lucidity. Painting he tells us is of a different quality to mathematics and finish in art is adding more effect. As for etchings, thereof two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing apostoror. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin, for not recognizing that a picture should denote the frailty of man. And remarks with pleasing courtesy and falseness graced that many phases of feeling, only unfortunately his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the tougher of painting. By Harry Quilter M.A. A man said to the universe, Sir, I exist. Sweat-covered Breon's body trickling into the tight-wing cloth that was the only garment you wore. The cut on his chest still dripping blood. The ache of his overstrained eyes. Even the soaring arena around him with thousands of spectators were trivealed, not worth thinking about. His instant panic was followed by a small sharp, blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzer's were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong, measured rate. He was in reverie sliding out on the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, knights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The 20s, he must have drawn his gun because the intruder said quickly, but that away, he'll be in the fool. Out, there is silence then, and still wondering, Brienne was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man, with an apparently inexhaustible story of energy. 
There could be little art in this last and final round of fencing, just thrust and parry and victory to the stronger. Every man who entered the 20s had his own training tricks. There appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the 20s, and death during the last round was, in some ways, easier than defeat. Breathing deeply, Brienne softly spoke the autohydrotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. I rolled up the maze at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Brienne saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our ol' Brienne sensed it and knew the fifth point was his. Then the powerful twist that's right to decide, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle with a bow, while poor shaggy sits there, a cooling dove. He has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stoutchains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled in disgrace in your friends, they're asking for you. I begged Ruggano a long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard since shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions as well as our nooms, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Whereas my brother now, in Quaragejjegi, in the metal forest. Where is that? The metal forest is in the great Dome to Cavern, the largest and all our dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny remarked by the bad sea thoughtfully. I don't believe Anne knew any magic or she'd have worked it before. I do not know, confessed shaggy. True, a great Calico. Calico went to the big gong and pounded on it, just as we're good or used to do, but no one answered the summons. Having returned to the royal cavern, Calico first pounded the gong and then sat in the throne, wearing reggos, discarded ruby crown, and holding in his hand to scepter which reggos had so often thrown at his head."] - EXPECTED_TEXT_4 = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. 
He has grave doubts whether Sir Frederick Latins' work is really Greek after all, and can discover in it but little of rocky Ithaca. Lennils, pictures, are a sort of upguards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Berkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate and expression. On the general principles of art, Mr. Quilter writes with equal lucidity. Painting he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, thereof two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing apostorer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin, for not recognizing that a picture should denote the frailty of man. And remarks with pleasing courtesy and solicitous grace that many phases of feeling only, unfortunately, his own work never does, get good. Mr. Quilter has missed his chance, for he has failed even to make himself the tougher of painting. By Harry Quilter, M.A. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the tight-wing cloth that was the only garment you wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators were trivialities not worth thinking about. His instant panic was followed by a small sharp blow, high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzer's were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, knights and the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. To 20s, he must have drawn his gun because the intruder said quickly, but that away, you're being a fool. Out, there is silence then, and still wondering, Brienne was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. I've read here at Mountain of a Man, with an apparently inexhaustible story of energy. There could be little art in this last and final round of fencing. Just thrust and parry and victory to the stronger. Every man who entered the 20s had his own training tricks. There appeared to be an immediate association with the death trauma, as if the two were inexplicably linked into one. Just strengthed and enabled someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. 
Others had died before during the 20s, and death during the last round was, in some ways, easier than defeat. Breathing deeply, Brienne softly spoke the autohydrotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. I rolled up the maze at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Brienne saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our ol' Brienne sensed it and knew the fifth point was his. Then the powerful twist that's right to the side, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle with a bow, while poor shaggy sits there, a cooling dove. She has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stoutchanges as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled in disgrace and your friends are asking for you. I begged Ruggano a long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard since shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions as well as our nooms, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now, in Quaragejji, in the metal forest? Where is that? The metal forest is in the great Dome to Cavern, the largest and all our dominions replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked a bit, see you thoughtfully. I don't believe Anne knew any magic or she'd have worked it before. I do not know, confessed shaggy. True, agreed Calico. Calico went to the big gong and pounded on it just as we're good we used to do, but no one answered the summons. Having returned to the royal cavern, Calico first pounded the gong and then sat in the throne, wearing reggos, discarded ruby crown and holding it his hand to scepter which reggo had so often thrown at his head."] + EXPECTED_TEXT_3 = [" gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating in its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins work is really Greek after all and can discover in it but little of rocky ithaka. Lennils, pictures, are a sort of upguards and atom paintings and Mason's exquisite itals are as national as a jingo poem. Mr. Birkut Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says like a shampooer and a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate and expression. Under general principles of art, Mr. Quilter writes with equal lucidity. 
Painting he tells us is of a different quality to mathematics and finish in art is adding more effect. As for etchings, thereof two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing apostoror. Near the fire, any ornaments spread brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man. And remarks with pleasing courtesy and solicitous grace that many faces are feeling, only unfortunately his own work never does get good. Mr. Quilter has missed his chance. For he has failed even to make himself the tougher of painting. By Harry Quilter M.A. A man said to the universe, Sir, I exist. Sweat covered Brienne's body trickling into the tight-wing cloth that was the only garment you wore. The cut on his chest still dripping blood. The ache of his overstrained eyes. Even the soaring arena around him with thousands of spectators, retrievalidies not worth thinking about. His instant panic was followed by a small sharp blow, high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzer's were triggered his muscles into complete relaxation. Only his heart and lungs worked on at a strong measured rate. He was in reverie, sliding out on the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, knights and the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The 20s, he must have drawn his gun because the intruder said quickly, but that away here being a fool. Out, there is silence then, and still wondering, Brienne was once more asleep. 10 seconds, he asked the handler who was needing his aching muscles. I've read here at Mountain of a Man with an apparently inexhaustible story of energy. There could be little art in this last and final round of fencing, just thrust and parry and victory to the stronger. Every man who entered the 20s had his own training tricks. There appeared to be an immediate association with the death trauma as if the two were anextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the 20s, and death during the last round was, in some ways, easier than defeat. Breathing deeply, Brienne's softly spoke the odd hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. I rolled up the maze at the sudden fury of the attack, then smiled. He said it was the last burst of energy. He knew how close they both were to exhaustion. Brienne saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our ol' Brienne sensed it and knew the fifth point was his. 
Then the powerful twist that's right to decide, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle with a bow, while poor shaggy sits there, a cooling dove. He has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled in disgrace in your friends, they're asking for you. I begged Brienne to long ago to send him away, but he would not do so. I also offered to help you brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions as well as our nooms, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico, whereas my brother now inquired Shaggy in the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all our dominions replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked to bed see you thoughtfully. I don't believe Anne knew any magic or she'd have worked it before. I do not know, confessed Shaggy. True, agreed Calico. Calico went to the big gone and pounded on it, just as we're good or used to do, but no one answered the summons. Having returned to the royal cavern, Calico first pounded the gone and then sat in the throne, wearing reggos, discarded ruby crown, and holding in his hand to scepter which reggos hand so often thrown at his head."] + EXPECTED_TEXT_4 = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins' work is really Greek after all, and can discover in it but little of rocky Ithaca. Lennils, pictures, are a sort of upguards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Berkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate and expression. On the general principles of art, Mr. Quilter writes with equal lucidity. Painting he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, thereof two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing apostorer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. 
Ruskin, for not recognizing that a picture should denote the frailty of man. And remarks with pleasing courtesy and solicitous grace that many phases of feeling only, unfortunately, his own work never does, get good. Mr. Quilter has missed his chance, for he has failed even to make himself the tougher of painting. By Harry Quilter, M.A. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the tight-wing cloth that was the only garment you wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators were trivialities not worth thinking about. His instant panic was followed by a small sharp blow, high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzer's were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, knights and the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance. And brand is the one I must see. Now stand aside. To 20s, he must have drawn his gun because the intruder said quickly, but that away, he could be no fool. Out, there was silence then, and still wondering, Brienne was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. I've read here at Mountain of a Man, with an apparently inexhaustible story of energy. There could be little art in this last and final round of fencing. Just thrust and parry and victory to the stronger. Every man who entered the 20s had his own training tricks. There appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the 20s, and death during the last round was, in some ways, easier than defeat. Breathing deeply, Brienne softly spoke the other hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. I rolled the maze at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Brienne saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from Irohog. Brienne sensed it and knew the fifth point was his. Then the powerful twist that's for us to decide, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle with a bow, while poor shaggy sits there, a cooling dove. He has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stoutchanges as easily as if they had been threads. 
The little girl had been asleep, but she heard the wraps and opened the door. The king has fled in disgrace in your friends, they are asking for you. I begged Ruggano a long ago to send him away, but he would not do so. I also offered to help you brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard since shaggy. He doesn't work at all. In fact, there is nothing he can do in these dominions, as well as our nooms, whose numbers are so great that it worries us to keep them all busy. And exactly we've turned Calico, where is my brother now in Quaragejji, in the metal forest? Where is that? The metal forest is in the great donned cavern, the largest and all our dominions replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked to Bedzeeth thoughtfully. I don't believe Anne knew any magic or she'd have worked before. I do not know, confessed shaggy. True, agreed Calico. Calico went to the big gong and pounded on it just as we're good to have used to do, but no one answered the summons. Having returned to the royal cavern, Calico first pounded the gong and then sat in the throne, wearing reggos, discarded ruby crown. And holding in his hand to scepter which reggos had so often thrown at his head."] # fmt: on processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") @@ -2334,8 +2333,8 @@ def test_whisper_longform_multi_batch_hard_prev_cond(self): " Folks, you watched this show, you know I spend most of my time right over there, carefully sorting through the days, big stories, and selecting only the most subtle, and unblemished ostrich and crocodile news leather, which I then entrust to artisan graduates of the Ickel Greg Waferandi, who carefully died them in a pallet of bright, zesty shades, and adorn them in the finest most topical inlay work, using hand tools and double magnifying glasses, then assemble them according to now classic and elegant geometry using our signature saddle stitching, and line it with bees, wax, coated linen, and finally attach a mallet hammered strap, perled hardware, and close-shet to create for you the one of a kind hope, kutur, earn-may is burkin bag that is my monologue, but sometimes, sometimes, sometimes. Sometimes, sometimes I wake up in the last car of an abandoned roller coaster at Kony Island, where I'm hiding from the triads, I have some engine lubricants out of a safe way bag and staggered down the shore to tear the sail off a beach sooner than I ripped the coaxial cable out of an RV and elderly couple from Utah, Hank, and Mabel Lovelyfokes, and use it to stitch the sail into a loose pouch like rock sack, and I stole a bag of a garbage truck to the junkyard, where I picked through to the debris for only the broken toys that make me the saddest, until I have loaded for you. The hobo fugitives bug out Bindle of news that is my segment.", " You know, folks, I spent a lot of time crafting for you a bespoke playlist of the day's big stories right over there. 
meticulously selecting the most topical chakra affirming scented candles, using Feng Shui, to perfectly align the joke energy in the exclusive boutique yoga retreat that is my monologue, but sometimes just sometimes, I go to the dumpster behind the waffle house at three in the morning, take off my shirt, cover myself and use fry oil, wrap my hands and some old duct tape I stole from a broken car window, pound a six pack of blueberry hard-seller and a second pill, as I stole from a park damsel, and it's then arm wrestle a raccoon in the back alley vision quest of news that is my segment.", " You know, folks, I spend most of my time right over there. Mining the days, biggest, most important stories, collecting the finest, most topical iron or hand hammering it into joke panels, then I craft sheets of bronze and blazing with patterns that tell an epic tale of conquest and glory. Then, using the Germanic tradition press, black process, I place thin sheets of foil against the scenes and by hammering or otherwise applying pressure from the back, I project these scenes into a pair of cheat cards and a face plate, and finally using fluted strips of white alloyed molding I divide the designs into framed panels and hold it all together using bronze rivets to create the beautiful and intimidating Anglo-Saxon battle helm that is my nightly monologue. Sometimes, sometimes, folks. Sometimes, just sometimes, I come to my senses fully naked on the deck of a pirate, beceived, melee, container ship that picked me up floating on the detainees. Then after I sunstroke in juice, realization of the crew of this ship plans to sell me and exchange for a bag of oranges to fight off scurvy, I lead a mutiny using only a PVC pipe in a pool chain that accepting my new role as captain and declaring myself king of the wind arc seas. I grab a dirty muck bucket covered in barnacles and a dornet with the teeth of the vanquished to create the softening wet pirate crown of news that is my segment. I'm going to use the white paper to create the softened white paper to create the softened white paper to create the softened white pirate crown of news that is my segment. Meanwhile.", - " Folks, if you watch this show, you know I spend most of my time right over there carefully blending for you the day's newsiest, most topical flower eggs, milk and butter. And straining into a fine batter to make delicate and informative comedy pancakes, then I glaze them in the juice and zest of the most relevant midnight valencio oranges. And doubts at all, and I find delimane de voyage cognac, before from bang and basting them tables, I deserve you the James Beard Award worthy creeps to ZET. That is my nightly monologue, but sometimes sometimes folks I wake up in the baggage hole of Greyhound bus, it's being hoisted by the scrapyard claw toward the burn pit. Escape to a nearby abandoned price chopper where I scrounge for old bread scraps, busted open bags of starfruit candies and expired eggs. Chuck it all on a dirty hubcap and slap it over a tire fire before using the legs of a strained pair of sweatpants and as ovenmets to extract and serve the demented transients pound cake of news that is my segment. Me wild!", - " Folks, if you watch the show and I hope you do, I spend a lot of time right over there. 
Tirelessly studying the lineage of the day's most important thoroughbred stories and whole-stiner headlines, working with the best trainers money can buy to rear their comedy offspring with a hand that is stern yet gentle into the triple crown winning equine specimen that is my nightly monologue. But sometimes sometimes folks I break into an unincorporated veterinary genetics lab. And grab whatever test tubes I can find and then under a grow light I got from it a discarded chia pet. I mixed the pill for DNA of a horse and whatever was in a tube labeled Keith Cole and extra. Sloering the concoction with caffeine pills and a microwave bread bowl, I screamed sing a prayer to Janice initiator of human life and God of transformation as a half horse, half man freak, seasons to life before me. And the hideous collection of loose animal parts and corrupted men tissue that is my segment. Meanwhile.", + " Folks, if you watch this show, you know I spend most of my time right over there carefully blending for you the day's newsiest, most topical flower eggs, milk and butter. And straining into a fine batter to make delicate and informative comedy pancakes, then I glaze them in the juice and zest of the most relevant midnight valencio oranges. And doubts at all, and I find delimane de voyage cognac, before from bang and basting them tables, I deserve you the James Beard Award worthy creeps to ZET. That is my nightly monologue, but sometimes sometimes folks I wake up in the baggage hole of Greyhound bus, it's being hoisted by the scrapyard claw toward the burn pit. Escape to a nearby abandoned price chopper where I scrounge for old bread scraps, busted open bags of starfruit candies and expired eggs. Chuck it all on a dirty hubcap and slap it over a tire fire before using the legs of a strained pair of sweatpants and as ovenmets to extract and serve the demented transients pound cake of news that is my segment.", + " Folks, if you watch the show and I hope you do, I spend a lot of time right over there. Tirelessly studying the lineage of the day's most important thoroughbred stories and whole-stiner headlines, working with the best trainers money can buy to rear their comedy offspring with a hand that is stern yet gentle into the triple crown winning equine specimen that is my nightly monologue. But sometimes sometimes folks I break into an unincorporated veterinary genetics lab. And grab whatever test tubes I can find and then under a grow light I got from it a discarded chia pet. I mixed the pill for DNA of a horse and whatever was in a tube labeled Keith Cole and extra. Sloering the concoction with caffeine pills and a microwave bread bowl, I screamed sing a prayer to Janice initiator of human life and God of transformation as a half horse, half man freak, seasons to life before me. 
And the hideous collection of loose animal parts and corrupted men tissue that is my segment.", ] # fmt: on @@ -2370,6 +2369,10 @@ def test_whisper_longform_multi_batch_hard_prev_cond(self): result = model.generate(**inputs, **gen_kwargs) decoded_all = processor.batch_decode(result, skip_special_tokens=True) + with open(f"file_all.txt", "w") as f: + for line in decoded_all: + f.write(line + "\n") + for i in range(num_samples): assert decoded_all[i] == EXPECTED_TEXT[i] From 00663776576c566e4d8c9f99740475d63ea40e0b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 25 Jan 2024 19:36:56 +0000 Subject: [PATCH 08/25] push all files --- src/transformers/models/whisper/generation_whisper.py | 9 --------- tests/models/whisper/test_modeling_whisper.py | 4 ---- 2 files changed, 13 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 25449598141519..fdd1a63778ad88 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -41,11 +41,6 @@ logger = logging.get_logger(__name__) -from transformers import AutoTokenizer - -tok = AutoTokenizer.from_pretrained("openai/whisper-medium.en") - - def _median_filter(inputs: torch.Tensor, filter_width: int) -> torch.Tensor: """ Applies a median filter of width `filter_width` along the last dimension of the input. @@ -756,10 +751,6 @@ def generate_with_fallback( generation_config.temperature = temperature if generation_config.do_sample else 1.0 generation_config.num_beams = kwargs.pop("num_beams", 1) if not generation_config.do_sample else 1 - # print(decoder_input_ids) - print(tok.batch_decode(decoder_input_ids, skip_special_tokens=False)) - print(temperature) - seek_outputs = super().generate( segment_input, generation_config, diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 7a681cba0c974e..fefeea9cfe4343 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -2369,10 +2369,6 @@ def test_whisper_longform_multi_batch_hard_prev_cond(self): result = model.generate(**inputs, **gen_kwargs) decoded_all = processor.batch_decode(result, skip_special_tokens=True) - with open(f"file_all.txt", "w") as f: - for line in decoded_all: - f.write(line + "\n") - for i in range(num_samples): assert decoded_all[i] == EXPECTED_TEXT[i] From 84d16eab6ea73ca971698b798cda67eda13a1873 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 25 Jan 2024 20:42:33 +0000 Subject: [PATCH 09/25] finish all --- .../models/whisper/generation_whisper.py | 13 ++++++++----- .../test_pipelines_automatic_speech_recognition.py | 9 ++------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index fdd1a63778ad88..64f3e2b5346d42 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -477,7 +477,7 @@ def generate( # 2. 
set global generate variables input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0] num_segment_frames = input_stride * self.config.max_source_positions - total_input_frames = self._retrieve_total_input_frames( + batch_size, total_input_frames = self._retrieve_total_input_frames( input_features=input_features, input_stride=input_stride, kwargs=kwargs ) is_shortform = total_input_frames <= num_segment_frames @@ -546,11 +546,11 @@ def generate( decoder_input_ids = kwargs.pop("decoder_input_ids", None) if decoder_input_ids is None: - one_tensor = torch.ones((input_features.shape[0], 1), device=input_features.device, dtype=torch.long) + one_tensor = torch.ones((batch_size, 1), device=self.device, dtype=torch.long) decoder_input_ids = torch.cat([t * one_tensor for t in init_tokens], dim=-1) if prompt_ids is not None: - decoder_input_ids = torch.cat([prompt_ids[None].repeat(input_features.shape[0], 1), decoder_input_ids], dim=-1) + decoder_input_ids = torch.cat([prompt_ids[None].repeat(decoder_input_ids.shape[0], 1), decoder_input_ids], dim=-1) if kwargs.get("max_new_tokens", 0) + decoder_input_ids.shape[-1] > self.config.max_target_positions: max_new_tokens = kwargs.get("max_new_tokens", 0) @@ -932,7 +932,7 @@ def _setup_no_speech_detection(logits_processor, segment_input, decoder_input_id @staticmethod def _retrieve_total_input_frames(input_features, input_stride, kwargs): if input_features is not None: - return input_features.shape[-1] + return input_features.shape[0], input_features.shape[-1] if "encoder_outputs" in kwargs: encoder_outputs_shape = ( @@ -940,7 +940,7 @@ def _retrieve_total_input_frames(input_features, input_stride, kwargs): if isinstance(kwargs["encoder_outputs"], BaseModelOutput) else kwargs["encoder_outputs"].shape ) - return encoder_outputs_shape[1] * input_stride + return encoder_outputs_shape[0], encoder_outputs_shape[1] * input_stride raise ValueError("Make sure to provide either `input_features` or `encoder_outputs` to `generate`.") @@ -1151,6 +1151,9 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): elif generation_config.return_timestamps and init_tokens[-1] == generation_config.no_timestamps_token_id: init_tokens = init_tokens[:-1] + # let's make sure we don't pass `None` tokens as prompt tokens + init_tokens = [t for t in init_tokens if t is not None] + return init_tokens @staticmethod diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 7b6a9f30c55ac9..f560b95fd0b7f9 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -1451,6 +1451,7 @@ def test_slow_unfinished_sequence(self): # Original model wasn't trained with timestamps and has incorrect generation config pipe.model.generation_config = GenerationConfig.from_pretrained("openai/whisper-large-v2") + # the audio is 4 seconds long audio = hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset") out = pipe( @@ -1459,13 +1460,7 @@ def test_slow_unfinished_sequence(self): ) self.assertEqual( out, - { - "chunks": [ - {"text": "", "timestamp": (18.94, 0.02)}, - {"text": "मिर्ची में कितने विभिन्न प्रजातियां हैं", "timestamp": (None, None)}, - ], - "text": "मिर्ची में कितने विभिन्न प्रजातियां हैं", - }, + {'text': 'मिर्ची में कितने विभिन्न प्रजातियां हैं', 'chunks': [{'timestamp': (0.26, None), 'text': 'मिर्ची में कितने विभिन्न प्रजातियां हैं'}]}, ) From 
4755bd3304f3db070b0f22f8fcfda350ba202cc9 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 25 Jan 2024 21:01:04 +0000 Subject: [PATCH 10/25] make style --- .../run_image_classification_no_trainer.py | 4 +- .../image-pretraining/run_mim_no_trainer.py | 4 +- .../language-modeling/run_clm_no_trainer.py | 4 +- .../language-modeling/run_mlm_no_trainer.py | 4 +- .../multiple-choice/run_swag_no_trainer.py | 4 +- .../run_qa_beam_search_no_trainer.py | 4 +- .../question-answering/run_qa_no_trainer.py | 4 +- .../run_semantic_segmentation_no_trainer.py | 4 +- .../run_summarization_no_trainer.py | 4 +- .../models/whisper/generation_whisper.py | 73 ++++++++++++------- tests/models/whisper/test_modeling_whisper.py | 2 +- ..._pipelines_automatic_speech_recognition.py | 5 +- 12 files changed, 80 insertions(+), 36 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 7c3aa725ea46ed..fa28a2a0d103b3 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -439,7 +439,9 @@ def collate_fn(examples): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps + if overrode_max_train_steps + else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 6d5c1849e5b3b2..4bc8b22d5ae14f 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -627,7 +627,9 @@ def preprocess_images(examples): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps + if overrode_max_train_steps + else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index a8e9b608e466d9..15d513b0c928b6 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -527,7 +527,9 @@ def group_texts(examples): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps + if overrode_max_train_steps + else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. 
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 97860cd2666abb..d9b8120a98e883 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -564,7 +564,9 @@ def group_texts(examples): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps + if overrode_max_train_steps + else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 533072fc0af15a..9ad72548329184 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -511,7 +511,9 @@ def preprocess_function(examples): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps + if overrode_max_train_steps + else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 65c38edca295df..905189d0d41a88 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -751,7 +751,9 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps + if overrode_max_train_steps + else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 5cb00d5225cb8f..1a58f6ce442f0e 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -781,7 +781,9 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps + if overrode_max_train_steps + else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. 
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 315c0ce4611cd2..0ba6c5957d1530 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -514,7 +514,9 @@ def preprocess_val(example_batch): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps + if overrode_max_train_steps + else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 3212ef1af52b04..96ccb552ed164f 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -581,7 +581,9 @@ def postprocess_text(preds, labels): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps + if overrode_max_train_steps + else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 64f3e2b5346d42..3dd44282d8ef6d 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -16,7 +16,7 @@ import math import warnings import zlib -from typing import Callable, List, Optional, Tuple, Union, Iterator +from typing import Callable, Iterator, List, Optional, Tuple, Union import numpy as np import torch @@ -527,14 +527,16 @@ def generate( ) # TODO(Sanchit) - passing `decoder_input_ids` is deprecated. One should use `prompt_ids` instead # This function should be be removed in v4.39 - self._check_decoder_input_ids(prompt_ids=prompt_ids, init_tokens=init_tokens, is_shortform=is_shortform, kwargs=kwargs) + self._check_decoder_input_ids( + prompt_ids=prompt_ids, init_tokens=init_tokens, is_shortform=is_shortform, kwargs=kwargs + ) # 3. 
Retrieve logits processors begin_index = len(init_tokens) logits_processor = self._retrieve_logit_processors( generation_config=generation_config, logits_processor=logits_processor, - begin_index=begin_index, # begin index is index of first generated decoder token + begin_index=begin_index, # begin index is index of first generated decoder token is_shortform=is_shortform, num_beams=kwargs.get("num_beams", 1), ) @@ -550,7 +552,9 @@ def generate( decoder_input_ids = torch.cat([t * one_tensor for t in init_tokens], dim=-1) if prompt_ids is not None: - decoder_input_ids = torch.cat([prompt_ids[None].repeat(decoder_input_ids.shape[0], 1), decoder_input_ids], dim=-1) + decoder_input_ids = torch.cat( + [prompt_ids[None].repeat(decoder_input_ids.shape[0], 1), decoder_input_ids], dim=-1 + ) if kwargs.get("max_new_tokens", 0) + decoder_input_ids.shape[-1] > self.config.max_target_positions: max_new_tokens = kwargs.get("max_new_tokens", 0) @@ -600,7 +604,12 @@ def generate( # 6.2 Preppare running variables, list for generation cur_bsz = batch_size - current_segments = self._prepare_segments(prompt_ids=prompt_ids, batch_size=batch_size, prompt_condition_type=prompt_condition_type, generation_config=generation_config) + current_segments = self._prepare_segments( + prompt_ids=prompt_ids, + batch_size=batch_size, + prompt_condition_type=prompt_condition_type, + generation_config=generation_config, + ) batch_idx_map = list(range(batch_size)) do_condition_on_prev_tokens = [condition_on_prev_tokens for _ in range(batch_size)] @@ -707,7 +716,11 @@ def generate( # 7. Once all segments are added to the list of all segments, called `current_segments`, we extract the predicted # output tokens from the list of dicts. If we use batch size > 1, we make sure to pad the output - final_segments = [x[1:] for x in current_segments] if (prompt_ids is not None and prompt_condition_type == "first-segment") else current_segments + final_segments = ( + [x[1:] for x in current_segments] + if (prompt_ids is not None and prompt_condition_type == "first-segment") + else current_segments + ) sequences = _pad_to_max_length(final_segments, generation_config.pad_token_id, padding="right") # 8. If we return all segments, the predicted output sequences are put under `"sequences"`. @@ -832,7 +845,6 @@ def generate_with_fallback( return seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens - @staticmethod def _prepare_segments(prompt_ids, batch_size, prompt_condition_type, generation_config): generation_config.prompt_condition_type = prompt_condition_type @@ -844,11 +856,12 @@ def _prepare_segments(prompt_ids, batch_size, prompt_condition_type, generation_ current_segments = [[] for _ in range(batch_size)] if generation_config.condition_on_prev_tokens is not True and prompt_condition_type == "all-segments": - raise ValueError("Make sure to set `condition_on_prev_tokens=True` when setting `prompt_condition_type='all-segments'`.") + raise ValueError( + "Make sure to set `condition_on_prev_tokens=True` when setting `prompt_condition_type='all-segments'`." 
+ ) return current_segments - def _postprocess_outputs(self, seek_outputs, decoder_input_ids, return_token_timestamps, generation_config): # remove all previously passed decoder input ids if isinstance(seek_outputs, torch.Tensor): @@ -875,7 +888,7 @@ def split_by_batch_index(values, key, batch_idx): seek_outputs = [ {k: split_by_batch_index(v, k, i) for k, v in seek_outputs.items()} for i in range(sequence_tokens.shape[0]) - ] + ] return sequence_tokens, seek_outputs @@ -1056,9 +1069,8 @@ def _set_language_and_task(language, task, is_multilingual, generation_config): @staticmethod def retrieve_init_tokens(generation_config, config, kwargs): - def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): - """ short function to replace num with a itr in lst """ + """short function to replace num with a itr in lst""" found = any(i in lst for i in itr) if found: lst = [num if i in itr else i for i in lst] @@ -1079,10 +1091,14 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): language = getattr(generation_config, "language", None) if forced_decoder_ids is not None and task is not None: - logger.info(f"You have passed task={task}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. `forced_decoder_ids` will be ignored in favor of task={task}.") + logger.info( + f"You have passed task={task}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. `forced_decoder_ids` will be ignored in favor of task={task}." + ) forced_decoder_ids = None elif forced_decoder_ids is not None and language is not None: - logger.info(f"You have passed language={language}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. `forced_decoder_ids` will be ignored in favor of language={language}.") + logger.info( + f"You have passed language={language}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. `forced_decoder_ids` will be ignored in favor of language={language}." + ) forced_decoder_ids = None init_tokens = [generation_config.decoder_start_token_id] @@ -1138,15 +1154,17 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): # if task is defined it'll overwrite task ids that might have already been defined via the generation_config replace_or_add(init_tokens, task_id, generation_config.task_to_id.values()) else: - raise ValueError( - f"The `{task}`task is not supported. The task should be one of `{TASK_IDS}`" - ) + raise ValueError(f"The `{task}`task is not supported. 
The task should be one of `{TASK_IDS}`") elif language is not None and hasattr(generation_config, "task_to_id"): # if language is defined, but no task id is in `init_tokens`, default to transcribe if not any(i in init_tokens for i in generation_config.task_to_id.values()): init_tokens.append(generation_config.task_to_id["transcribe"]) - if not generation_config.return_timestamps and hasattr(generation_config, "no_timestamps_token_id") and init_tokens[-1] != generation_config.no_timestamps_token_id: + if ( + not generation_config.return_timestamps + and hasattr(generation_config, "no_timestamps_token_id") + and init_tokens[-1] != generation_config.no_timestamps_token_id + ): init_tokens.append(generation_config.no_timestamps_token_id) elif generation_config.return_timestamps and init_tokens[-1] == generation_config.no_timestamps_token_id: init_tokens = init_tokens[:-1] @@ -1160,11 +1178,18 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): def _check_decoder_input_ids(prompt_ids, init_tokens, is_shortform, kwargs): decoder_input_ids = kwargs.get("decoder_input_ids", None) if prompt_ids is not None and decoder_input_ids is not None: - raise ValueError(f"Cannot pass both `prompt_ids`: {prompt_ids} and `decoder_input_ids`: {decoder_input_ids}. Passing `decoder_input_ids` is deprecated, consider not passing it.") + raise ValueError( + f"Cannot pass both `prompt_ids`: {prompt_ids} and `decoder_input_ids`: {decoder_input_ids}. Passing `decoder_input_ids` is deprecated, consider not passing it." + ) elif decoder_input_ids is not None and not is_shortform: - raise ValueError(f"Cannot pass both `decoder_input_ids`: {decoder_input_ids} for long-form generation. Consider passing `prompt_ids` instead.") + raise ValueError( + f"Cannot pass both `decoder_input_ids`: {decoder_input_ids} for long-form generation. Consider passing `prompt_ids` instead." + ) elif decoder_input_ids is not None and is_shortform: - warnings.warn(f"You have provided `decoder_input_ids` which will overwrite the `init_tokens` {init_tokens}. This might lead to unexpected behavior. Passing `decoder_input_ids` is deprecated and will be removed in v4.39. Consider passing `prompt_ids` instead.", FutureWarning) + warnings.warn( + f"You have provided `decoder_input_ids` which will overwrite the `init_tokens` {init_tokens}. This might lead to unexpected behavior. Passing `decoder_input_ids` is deprecated and will be removed in v4.39. 
Consider passing `prompt_ids` instead.", + FutureWarning, + ) @staticmethod def _set_token_ids(generation_config, config, kwargs): @@ -1247,9 +1272,7 @@ def _retrieve_max_frames_and_seek(batch_size, attention_mask, total_input_frames return max_frames, seek - def _retrieve_logit_processors( - self, generation_config, logits_processor, begin_index, is_shortform, num_beams - ): + def _retrieve_logit_processors(self, generation_config, logits_processor, begin_index, is_shortform, num_beams): if generation_config.return_timestamps is True: timestamp_processor = WhisperTimeStampLogitsProcessor(generation_config, begin_index=begin_index) logits_processor = ( @@ -1380,7 +1403,7 @@ def _prepare_decoder_input_ids( kwargs["decoder_attention_mask"] = decoder_input_ids != generation_config.pad_token_id elif prompt_ids is not None: - prev_tokens = prompt_ids[None].repeat(decoder_input_ids.shape[0], 1) + prev_tokens = prompt_ids[None].repeat(decoder_input_ids.shape[0], 1) decoder_input_ids = torch.cat([prev_tokens, decoder_input_ids], dim=-1) # make sure `"decoder_attention_mask"` is not passed to forward kwargs.pop("decoder_attention_mask", None) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index fefeea9cfe4343..17f23a263283a8 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1313,7 +1313,7 @@ def test_generate_with_prompt_ids_max_length(self): f"`decoder_input_ids` and `max_new_tokens` is: {max_new_tokens + decoder_input_ids.shape[-1]}. This exceeds the " f"`max_target_positions` of the Whisper model: {config.max_target_positions}. " "You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, " - f"so that their combined length is less than {config.max_target_positions}." 
+ f"so that their combined length is less than {config.max_target_positions}.", ): model.generate(input_features, max_new_tokens=max_new_tokens, prompt_ids=prompt_ids) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index f560b95fd0b7f9..e99f8d6862e386 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -1460,7 +1460,10 @@ def test_slow_unfinished_sequence(self): ) self.assertEqual( out, - {'text': 'मिर्ची में कितने विभिन्न प्रजातियां हैं', 'chunks': [{'timestamp': (0.26, None), 'text': 'मिर्ची में कितने विभिन्न प्रजातियां हैं'}]}, + { + "text": "मिर्ची में कितने विभिन्न प्रजातियां हैं", + "chunks": [{"timestamp": (0.26, None), "text": "मिर्ची में कितने विभिन्न प्रजातियां हैं"}], + }, ) From 42f50c140dfcefd14fb59653c9c2d32bc93980fc Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 26 Jan 2024 07:14:27 +0000 Subject: [PATCH 11/25] Fix timestamp wrap --- src/transformers/models/whisper/generation_whisper.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 3dd44282d8ef6d..659192bfe3311b 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -174,6 +174,8 @@ def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_prec weights = torch.stack([cross_attentions[l][:, h] for l, h in alignment_heads]) weights = weights.permute([1, 0, 2, 3]) + weight_length = None + if "beam_indices" in generate_outputs: # If beam search has been used, the output sequences may have been generated for more timesteps than their sequence_lengths # since the beam search strategy chooses the most probable sequences at the end of the search. 
@@ -195,9 +197,9 @@ def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_prec dim=2, ) - # make sure timestamps are as long as cross_attention - input_length = cross_attentions[0].shape[2] + 1 - timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)[:, :input_length] + # make sure timestamps are as long as weights + input_length = weight_length or cross_attentions[0].shape[2] + timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)[:, :input_length + 1] batch_size = timestamps.shape[0] if num_frames is not None: From 61756745363346136d6f7f3ae28338b58b4b3eaa Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 26 Jan 2024 07:15:00 +0000 Subject: [PATCH 12/25] make style --- src/transformers/models/whisper/generation_whisper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 659192bfe3311b..1b013c50981441 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -199,7 +199,7 @@ def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_prec # make sure timestamps are as long as weights input_length = weight_length or cross_attentions[0].shape[2] - timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)[:, :input_length + 1] + timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)[:, : input_length + 1] batch_size = timestamps.shape[0] if num_frames is not None: From 58ee94f4fa65371723c2dc6a91b8390614119f58 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 26 Jan 2024 08:39:02 +0000 Subject: [PATCH 13/25] make style --- .../models/whisper/generation_whisper.py | 48 +++++++--- tests/models/whisper/test_modeling_whisper.py | 90 +++++++++++++++++++ 2 files changed, 124 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 1b013c50981441..6574c87b4cda56 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -264,7 +264,7 @@ def generate( language: Optional[str] = None, is_multilingual: Optional[bool] = None, prompt_ids: Optional[torch.Tensor] = None, - prompt_condition_type: str = "first-segment", # first-segment, all-segments + prompt_condition_type: Optional[str] = None, # first-segment, all-segments condition_on_prev_tokens: Optional[bool] = None, temperature: Optional[Union[float, Tuple[float, ...]]] = None, compression_ratio_threshold: Optional[float] = None, @@ -338,6 +338,9 @@ def generate( provided as a prompt to each chunk. This can be used to provide or "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns to make it more likely to predict those words correctly. It cannot be used in conjunction with `decoder_start_token_id` as it overwrites this value. + prompt_condition_type (`str`, *optional*): + Only relevant for long-form transcription. Condition type of `prompt_ids`. 'first-segment' means only the first segment is conditioned on `prompt_ids`. 'all-segments' means each segment is conditioned on `prompt_ids`. Make sure to enable `condition_on_prev_tokens` for 'all-segments'. + Defaults to 'first-segment'. For short-term transcription only 'first-segment' is possible. 
condition_on_prev_tokens (`bool`, *optional*): Only relevant for long-form transcription. Whether to condition each segment on the previous segment. As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve @@ -521,6 +524,11 @@ def generate( no_speech_threshold=no_speech_threshold, condition_on_prev_tokens=condition_on_prev_tokens, ) + self._set_prompt_condition_type( + generation_config=generation_config, + prompt_condition_type=prompt_condition_type, + ) + # pass self.config for backward compatibility init_tokens = self.retrieve_init_tokens( generation_config=generation_config, @@ -609,7 +617,6 @@ def generate( current_segments = self._prepare_segments( prompt_ids=prompt_ids, batch_size=batch_size, - prompt_condition_type=prompt_condition_type, generation_config=generation_config, ) @@ -720,7 +727,7 @@ def generate( # output tokens from the list of dicts. If we use batch size > 1, we make sure to pad the output final_segments = ( [x[1:] for x in current_segments] - if (prompt_ids is not None and prompt_condition_type == "first-segment") + if (prompt_ids is not None and generation_config.prompt_condition_type == "first-segment") else current_segments ) sequences = _pad_to_max_length(final_segments, generation_config.pad_token_id, padding="right") @@ -848,20 +855,14 @@ def generate_with_fallback( return seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens @staticmethod - def _prepare_segments(prompt_ids, batch_size, prompt_condition_type, generation_config): - generation_config.prompt_condition_type = prompt_condition_type - - if prompt_ids is not None and prompt_condition_type == "first-segment": - prompt_ids = prompt_ids[1:] if prompt_ids[0] == generation_config.prev_sot_token_id else prompt_ids + def _prepare_segments(prompt_ids, batch_size, generation_config): + if prompt_ids is not None and generation_config.prompt_condition_type == "first-segment": + prev_sot_token_id = getattr(generation_config, "prev_sot_token_id", None) + prompt_ids = prompt_ids[1:] if prompt_ids[0] == prev_sot_token_id else prompt_ids current_segments = [[{"tokens": prompt_ids}] for _ in range(batch_size)] else: current_segments = [[] for _ in range(batch_size)] - if generation_config.condition_on_prev_tokens is not True and prompt_condition_type == "all-segments": - raise ValueError( - "Make sure to set `condition_on_prev_tokens=True` when setting `prompt_condition_type='all-segments'`." - ) - return current_segments def _postprocess_outputs(self, seek_outputs, decoder_input_ids, return_token_timestamps, generation_config): @@ -1250,6 +1251,25 @@ def _set_thresholds_and_condition( else getattr(generation_config, "condition_on_prev_tokens", None) ) + @staticmethod + def _set_prompt_condition_type(generation_config, prompt_condition_type): + allowed_cond_types = ["first-segment", "all-segments"] + + # default to "first-segment" + prompt_condition_type = prompt_condition_type or allowed_cond_types[0] + + if prompt_condition_type not in allowed_cond_types: + raise ValueError( + f"`prompt_condition_type={prompt_condition_type} does not exist. Make sure to set `prompt_condition_type` to one of {', '.join(allowed_cond_types)}" + ) + + if generation_config.condition_on_prev_tokens is not True and prompt_condition_type == "all-segments": + raise ValueError( + "Make sure to set `condition_on_prev_tokens=True` when setting `prompt_condition_type='all-segments'`." 
+ ) + + generation_config.prompt_condition_type = prompt_condition_type + @staticmethod def _set_condition_on_prev_tokens(condition_on_prev_tokens, generation_config): condition_on_prev_tokens = ( @@ -1263,7 +1283,7 @@ def _set_condition_on_prev_tokens(condition_on_prev_tokens, generation_config): def _retrieve_max_frames_and_seek(batch_size, attention_mask, total_input_frames): if batch_size > 1 and attention_mask is None: raise ValueError( - "When doing long-form audio transcription, make sure to pass an `attention_mask`. You can retrieve the `attention_mask` by doing `processor(audio, ..., return_attention_mask=True)` " + "When doing batched long-form audio transcription, make sure to pass an `attention_mask`. You can retrieve the `attention_mask` by doing `processor(audio, ..., return_attention_mask=True)` " ) elif batch_size > 1: max_frames = attention_mask.sum(-1).cpu().to(torch.long) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 17f23a263283a8..4bd8fa823f1158 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1319,6 +1319,46 @@ def test_generate_with_prompt_ids_max_length(self): model.generate(input_features, max_new_tokens=1, prompt_ids=prompt_ids) + def test_generate_longform_with_prompt_ids(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = WhisperForConditionalGeneration(config).eval().to(torch_device) + + prompt_ids = torch.arange(5).to(torch_device) + model.generation_config.no_timestamps_token_id = 11 + model.generation_config.pad_token_id = 10 + + # make sure prompt token ids [0-9] can't be generated + model.generation_config.suppress_tokens = list(range(10)) + + input_features = input_dict["input_features"] + + language = "<|de|>" + lang_id = 6 + + input_features = input_features.repeat(1, 1, 50) + attention_mask = torch.ones_like(input_features, dtype=torch.long)[:, 0] + + for prompt_type in ["first-segment", "all-segments"]: + for task_id, task in enumerate(["translate", "transcribe"]): + task_id = 7 + task_id + + model.generation_config.__setattr__("lang_to_id", {language: lang_id}) + model.generation_config.__setattr__("task_to_id", {task: task_id}) + + output = model.generate( + input_features, + attention_mask=attention_mask, + prompt_condition_type=prompt_type, + max_new_tokens=5, + task=task, + language=language, + prompt_ids=prompt_ids, + condition_on_prev_tokens=True, + ) + for row in output.tolist(): + # make sure no token below 10 is in generated output => this means for long-form prompt ids should NOT be returned + assert not any(i in row for i in model.generation_config.suppress_tokens) + def _check_longform_generate_single_batch(self, condition_on_prev_tokens): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -2150,6 +2190,56 @@ def test_whisper_longform_single_batch(self): assert is_increasing + @slow + def test_whisper_longform_prompt_ids(self): + processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") + model = model.to("cuda") + + prompt = "Mr. Kilter, Ruggedo." # let's force Mr. Quilter -> Mr. 
Kilter + prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to("cuda") + + ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean") + one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) + + first_text = ds["validation"][0]["text"].lower() + last_text = ds["validation"][-1]["text"].lower() + + input_features = processor(one_audio, return_tensors="pt", truncation=False, padding="longest")[ + "input_features" + ] + input_features = input_features.to(device="cuda") + + result = model.generate( + input_features, + prompt_ids=prompt_ids, + return_timestamps=True, + prompt_condition_type="first-segment", + condition_on_prev_tokens=True, + ) + decoded_first_segment = processor.batch_decode(result, skip_special_tokens=True) + + result = model.generate( + input_features, + prompt_ids=prompt_ids, + return_timestamps=True, + prompt_condition_type="all-segments", + condition_on_prev_tokens=True, + ) + decoded_all_segments = processor.batch_decode(result, skip_special_tokens=True) + + # show that first segment has quilter and last segment has ruggedo + assert "quilter" in first_text + assert "ruggedo" in last_text + + # condition on first segment correctly changes to kilter in first segment, but does not transcribe "ruggedo" correctly + assert "kilter" in decoded_first_segment[0][: len(first_text)].lower() + assert "ruggedo" not in decoded_first_segment[0][-len(last_text) :].lower() + + # condition on all-segment correctly changes to kilter in first segment and correctly transcribes "ruggedo" + assert "kilter" in decoded_all_segments[0][: len(first_text)].lower() + assert "ruggedo" in decoded_all_segments[0][-len(last_text) :].lower() + @slow def test_whisper_longform_single_batch_prev_cond(self): # fmt: off From ce325c933446be543eb8be6b0a2dcd649d24d564 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 26 Jan 2024 08:43:01 +0000 Subject: [PATCH 14/25] up --- .../run_image_classification_no_trainer.py | 4 +--- examples/pytorch/image-pretraining/run_mim_no_trainer.py | 4 +--- examples/pytorch/language-modeling/run_clm_no_trainer.py | 4 +--- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 4 +--- examples/pytorch/multiple-choice/run_swag_no_trainer.py | 4 +--- .../question-answering/run_qa_beam_search_no_trainer.py | 4 +--- examples/pytorch/question-answering/run_qa_no_trainer.py | 4 +--- .../run_semantic_segmentation_no_trainer.py | 4 +--- .../pytorch/summarization/run_summarization_no_trainer.py | 4 +--- 9 files changed, 9 insertions(+), 27 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index fa28a2a0d103b3..7c3aa725ea46ed 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -439,9 +439,7 @@ def collate_fn(examples): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps - if overrode_max_train_steps - else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. 
diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 4bc8b22d5ae14f..6d5c1849e5b3b2 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -627,9 +627,7 @@ def preprocess_images(examples): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps - if overrode_max_train_steps - else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 15d513b0c928b6..a8e9b608e466d9 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -527,9 +527,7 @@ def group_texts(examples): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps - if overrode_max_train_steps - else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index d9b8120a98e883..97860cd2666abb 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -564,9 +564,7 @@ def group_texts(examples): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps - if overrode_max_train_steps - else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 9ad72548329184..533072fc0af15a 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -511,9 +511,7 @@ def preprocess_function(examples): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps - if overrode_max_train_steps - else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. 
diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 905189d0d41a88..65c38edca295df 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -751,9 +751,7 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps - if overrode_max_train_steps - else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 1a58f6ce442f0e..5cb00d5225cb8f 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -781,9 +781,7 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps - if overrode_max_train_steps - else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 0ba6c5957d1530..315c0ce4611cd2 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -514,9 +514,7 @@ def preprocess_val(example_batch): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps - if overrode_max_train_steps - else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 96ccb552ed164f..3212ef1af52b04 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -581,9 +581,7 @@ def postprocess_text(preds, labels): name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps - if overrode_max_train_steps - else args.max_train_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps if overrode_max_train_steps else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. 
From 0f06889b1f178a7310ce88a75ab8057b0e363ce3 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 26 Jan 2024 08:59:12 +0000 Subject: [PATCH 15/25] up --- tests/models/whisper/test_modeling_whisper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 4bd8fa823f1158..88427455273e14 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1281,7 +1281,7 @@ def test_generate_with_prompt_ids_and_forced_decoder_ids(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() model = WhisperForConditionalGeneration(config).eval().to(torch_device) input_features = input_dict["input_features"] - prompt_ids = torch.arange(5).to("cuda") + prompt_ids = torch.arange(5).to(torch_device) forced_decoder_ids = [(1, 6), (2, 7), (3, 8)] output = model.generate( From 890eefb825b1518799c5b0244934a1013ca7c7a0 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 26 Jan 2024 09:44:08 +0000 Subject: [PATCH 16/25] up --- tests/models/whisper/test_modeling_whisper.py | 50 ++++++++++--------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 88427455273e14..01285cfdd8f58f 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1302,7 +1302,7 @@ def test_generate_with_prompt_ids_max_length(self): model = WhisperForConditionalGeneration(config).eval().to(torch_device) input_features = input_dict["input_features"] - decoder_input_ids = torch.arange(5).to("cuda") + decoder_input_ids = torch.arange(5).to(torch_device) prompt_ids = decoder_input_ids[:4] max_new_tokens = 8 @@ -2008,7 +2008,7 @@ def test_generate_with_prompt_ids(self): input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device) output_without_prompt = model.generate(input_features) - prompt_ids = processor.get_prompt_ids("Leighton", return_tensors="pt").to("cuda") + prompt_ids = processor.get_prompt_ids("Leighton", return_tensors="pt").to(torch_device) output_with_prompt = model.generate(input_features, prompt_ids=prompt_ids) expected_without_prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of Rocky Ithaca.<|endoftext|>" @@ -2031,7 +2031,7 @@ def test_generate_with_prompt_ids_and_forced_decoder_ids(self): language = "de" expected_tokens = [f"<|{task}|>", f"<|{language}|>"] prompt = "test prompt" - prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to("cuda") + prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(torch_device) output = model.generate(input_features, task=task, language=language, prompt_ids=prompt_ids) text = processor.decode(output[0]) @@ -2047,7 +2047,7 @@ def test_generate_with_prompt_ids_and_no_non_prompt_forced_decoder_ids(self): input_speech = self._load_datasamples(1) input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device) prompt = "test prompt" - prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to("cuda") + prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(torch_device) model.generation_config.forced_decoder_ids = None model.config.forced_decoder_ids = None @@ -2078,7 +2078,9 @@ def 
test_speculative_decoding_distil(self): dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = dataset[0]["audio"] - input_features = processor(sample["array"], return_tensors="pt").input_features.to("cuda").to(torch.float16) + input_features = ( + processor(sample["array"], return_tensors="pt").input_features.to(torch_device).to(torch.float16) + ) # warm up assisted decoding _ = model.generate(input_features, assistant_model=assistant_model) @@ -2126,7 +2128,9 @@ def test_speculative_decoding_non_distil(self): dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = dataset[0]["audio"] - input_features = processor(sample["array"], return_tensors="pt").input_features.to("cuda").to(torch.float16) + input_features = ( + processor(sample["array"], return_tensors="pt").input_features.to(torch_device).to(torch.float16) + ) # warm up assisted decoding _ = model.generate(input_features, assistant_model=assistant_model) @@ -2161,7 +2165,7 @@ def test_whisper_longform_single_batch(self): processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - model = model.to("cuda") + model = model.to(torch_device) ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) @@ -2169,7 +2173,7 @@ def test_whisper_longform_single_batch(self): input_features = processor(one_audio, return_tensors="pt", truncation=False, padding="longest")[ "input_features" ] - input_features = input_features.to(device="cuda") + input_features = input_features.to(device=torch_device) result = model.generate(input_features, return_timestamps=True) decoded = processor.batch_decode(result, skip_special_tokens=True) @@ -2194,10 +2198,10 @@ def test_whisper_longform_single_batch(self): def test_whisper_longform_prompt_ids(self): processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - model = model.to("cuda") + model = model.to(torch_device) prompt = "Mr. Kilter, Ruggedo." # let's force Mr. Quilter -> Mr. 
Kilter - prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to("cuda") + prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(torch_device) ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) @@ -2208,7 +2212,7 @@ def test_whisper_longform_prompt_ids(self): input_features = processor(one_audio, return_tensors="pt", truncation=False, padding="longest")[ "input_features" ] - input_features = input_features.to(device="cuda") + input_features = input_features.to(device=torch_device) result = model.generate( input_features, @@ -2248,7 +2252,7 @@ def test_whisper_longform_single_batch_prev_cond(self): processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - model = model.to("cuda") + model = model.to(torch_device) ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) @@ -2256,7 +2260,7 @@ def test_whisper_longform_single_batch_prev_cond(self): input_features = processor(one_audio, return_tensors="pt", truncation=False, padding="longest")[ "input_features" ] - input_features = input_features.to(device="cuda") + input_features = input_features.to(device=torch_device) gen_kwargs = { "return_timestamps": True, @@ -2284,7 +2288,7 @@ def test_whisper_longform_multi_batch(self): processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - model = model.to("cuda") + model = model.to(torch_device) ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) @@ -2297,7 +2301,7 @@ def test_whisper_longform_multi_batch(self): decoded_single = [] for audio in audios: inputs = processor(audio, return_tensors="pt", truncation=False) - inputs = inputs.to(device="cuda") + inputs = inputs.to(device=torch_device) result = model.generate(**inputs, return_timestamps=True) decoded_single.append(processor.batch_decode(result, skip_special_tokens=True)) @@ -2305,7 +2309,7 @@ def test_whisper_longform_multi_batch(self): inputs = processor( audios, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True ) - inputs = inputs.to(device="cuda") + inputs = inputs.to(device=torch_device) result = model.generate(**inputs, return_timestamps=True) decoded_all = processor.batch_decode(result, skip_special_tokens=True) @@ -2333,7 +2337,7 @@ def test_whisper_longform_multi_batch_prev_cond(self): processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - model = model.to("cuda") + model = model.to(torch_device) ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) @@ -2355,7 +2359,7 @@ def test_whisper_longform_multi_batch_prev_cond(self): decoded_single = [] for audio in audios: inputs = processor(audio, return_tensors="pt", truncation=False) - inputs = inputs.to(device="cuda") + inputs = inputs.to(device=torch_device) result = model.generate(**inputs, **gen_kwargs) decoded_single.append(processor.batch_decode(result, skip_special_tokens=True)) @@ -2383,7 +2387,7 @@ def 
test_whisper_longform_multi_batch_hard(self): processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - model = model.to("cuda") + model = model.to(torch_device) ds = load_dataset("distil-whisper/meanwhile", "default")["test"] ds = ds.cast_column("audio", Audio(sampling_rate=16000)) @@ -2396,7 +2400,7 @@ def test_whisper_longform_multi_batch_hard(self): decoded_single = [] for audio in audios: inputs = processor(audio, return_tensors="pt", truncation=False, sampling_rate=16_000) - inputs = inputs.to(device="cuda") + inputs = inputs.to(device=torch_device) result = model.generate(**inputs, return_timestamps=True) decoded_single += processor.batch_decode(result, skip_special_tokens=True) @@ -2404,7 +2408,7 @@ def test_whisper_longform_multi_batch_hard(self): inputs = processor( audios, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True ) - inputs = inputs.to(device="cuda") + inputs = inputs.to(device=torch_device) result = model.generate(**inputs, return_timestamps=True) decoded_all = processor.batch_decode(result, skip_special_tokens=True) @@ -2430,7 +2434,7 @@ def test_whisper_longform_multi_batch_hard_prev_cond(self): processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - model = model.to("cuda") + model = model.to(torch_device) ds = load_dataset("distil-whisper/meanwhile", "default")["test"] ds = ds.cast_column("audio", Audio(sampling_rate=16000)) @@ -2443,7 +2447,7 @@ def test_whisper_longform_multi_batch_hard_prev_cond(self): inputs = processor( audios, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True ) - inputs = inputs.to(device="cuda") + inputs = inputs.to(device=torch_device) gen_kwargs = { "return_timestamps": True, From 6289521f195eb8c9e5a4fffaaf683130989a1df7 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 29 Jan 2024 16:35:24 +0000 Subject: [PATCH 17/25] Fix lang detection behavior --- .../models/whisper/generation_whisper.py | 90 ++++++++++++++++--- 1 file changed, 78 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 6574c87b4cda56..7daf2c3b773a9b 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -530,9 +530,11 @@ def generate( ) # pass self.config for backward compatibility - init_tokens = self.retrieve_init_tokens( + init_tokens = self._retrieve_init_tokens( + input_features, generation_config=generation_config, config=self.config, + num_segment_frames=num_segment_frames, kwargs=kwargs, ) # TODO(Sanchit) - passing `decoder_input_ids` is deprecated. 
One should use `prompt_ids` instead @@ -1070,8 +1072,7 @@ def _set_language_and_task(language, task, is_multilingual, generation_config): ) generation_config.task = task - @staticmethod - def retrieve_init_tokens(generation_config, config, kwargs): + def _retrieve_init_tokens(self, input_features, generation_config, config, num_segment_frames, kwargs): def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): """short function to replace num with a itr in lst""" found = any(i in lst for i in itr) @@ -1081,14 +1082,14 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): lst.append(num) return lst - forced_decoder_ids = None - # Legacy code for backward compatibility - if hasattr(config, "forced_decoder_ids") and config.forced_decoder_ids is not None: - forced_decoder_ids = config.forced_decoder_ids + if kwargs.get("forced_decoder_ids", None) is not None: + forced_decoder_ids = kwargs["forced_decoder_ids"] elif hasattr(generation_config, "forced_decoder_ids") and generation_config.forced_decoder_ids is not None: forced_decoder_ids = generation_config.forced_decoder_ids + elif hasattr(config, "forced_decoder_ids") and config.forced_decoder_ids is not None: + forced_decoder_ids = config.forced_decoder_ids else: - forced_decoder_ids = kwargs.pop("forced_decoder_ids", None) + forced_decoder_ids = None task = getattr(generation_config, "task", None) language = getattr(generation_config, "language", None) @@ -1144,10 +1145,20 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): # if language is defined it'll overwrite language ids that might have already been defined via the generation_config replace_or_add(init_tokens, lang_id, generation_config.lang_to_id.values()) - elif task is not None and hasattr(generation_config, "lang_to_id"): - # default to English - lang_id = generation_config.decoder_start_token_id + 1 # start_token_id + 1 is - replace_or_add(init_tokens, lang_id, generation_config.lang_to_id.values()) + elif len(init_tokens) <= 1 or (len(init_tokens) > 1 and init_tokens[1] is None): + # language is not defined or intentially set to `None` to trigger language detection + lang_ids = self.detect_language(input_features=input_features, encoder_outputs=kwargs.get("encoder_outputs", None), generation_config=generation_config, num_segment_frames=num_segment_frames) + + if torch.unique(lang_ids).shape[0] > 1: + raise ValueError("Multiple languages detected when trying to guess the target language for transcription. It is currently not supported to transcribe to different languages in a single batch. Please make sure to either force a single language by passing `languag='...'` or make sure all input audio is of the same language.") + + lang_id = lang_ids[0].item() + + # append or replace lang_id to init_tokens + if len(init_tokens) > 1: + init_tokens[1] = lang_id + else: + init_tokens.append(lang_id) if task is not None: if task in TASK_IDS: @@ -1177,6 +1188,61 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): return init_tokens + def detect_language(self, input_features: Optional[torch.FloatTensor], encoder_outputs: Optional[Union[torch.FloatTensor, BaseModelOutput]], generation_config: Optional[GenerationConfig] = None, num_segment_frames: int = 3000) -> torch.Tensor: + """ + Detects language from log-mel input features or encoder_outputs + + Parameters: + input_features (`torch.Tensor` of shape `(batch_size, feature_size, sequence_length)`, *optional*): + Float values of log-mel features extracted from the raw speech waveform. 
The raw speech waveform can be obtained by + loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the + [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a + tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] for details. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. `**kwargs` + passed to generate matching the attributes of `generation_config` will override them. If + `generation_config` is not provided, the default will be used, which had the following loading + priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model + configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s + default values, whose documentation should be checked to parameterize generation. + num_segment_frames (`int`, defaults to 3000): + The number of log-mel frames the model expects + + Return: + A `torch.LongTensor` representing the detected language ids. + """ + if input_features is None and encoder_outputs is None: + raise ValueError("You have to specify either `input_features` or `encoder_outputs`") + elif not input_features is None and not encoder_outputs is None: + raise ValueError("Make sure to specificy only one of `input_features` or `encoder_outputs` - not both!") + elif input_features is not None: + inputs = {"input_features": input_features[:, :, :num_segment_frames]} + batch_size = input_features.shape[0] + elif encoder_outputs is not None: + inputs = {"encoder_outputs": encoder_outputs} + batch_size = encoder_outputs[0].shape[0] if isinstance(encoder_outputs, BaseModelOutput) else encoder_outputs[0] + + generation_config = generation_config or self.generation_config + decoder_input_ids = torch.ones((batch_size, 1), device=self.device, dtype=torch.long) * generation_config.decoder_start_token_id + + with torch.no_grad(): + logits = self(**inputs, decoder_input_ids=decoder_input_ids).logits[:, -1] + + non_lang_mask = torch.ones_like(logits[0], dtype=torch.bool) + non_lang_mask[list(generation_config.lang_to_id.values())] = False + + logits[:, non_lang_mask] = -np.inf + + lang_ids = logits.argmax(-1) + + return lang_ids + + @staticmethod def _check_decoder_input_ids(prompt_ids, init_tokens, is_shortform, kwargs): decoder_input_ids = kwargs.get("decoder_input_ids", None) From 633fab02a5b85c26cd668fb05fad8577b14fd117 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 29 Jan 2024 16:35:35 +0000 Subject: [PATCH 18/25] Fix lang detection behavior --- .../models/whisper/generation_whisper.py | 37 ++++++++++++++----- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 7daf2c3b773a9b..68e95ac7ea2f95 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ 
b/src/transformers/models/whisper/generation_whisper.py @@ -1147,16 +1147,23 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): replace_or_add(init_tokens, lang_id, generation_config.lang_to_id.values()) elif len(init_tokens) <= 1 or (len(init_tokens) > 1 and init_tokens[1] is None): # language is not defined or intentially set to `None` to trigger language detection - lang_ids = self.detect_language(input_features=input_features, encoder_outputs=kwargs.get("encoder_outputs", None), generation_config=generation_config, num_segment_frames=num_segment_frames) - + lang_ids = self.detect_language( + input_features=input_features, + encoder_outputs=kwargs.get("encoder_outputs", None), + generation_config=generation_config, + num_segment_frames=num_segment_frames, + ) + if torch.unique(lang_ids).shape[0] > 1: - raise ValueError("Multiple languages detected when trying to guess the target language for transcription. It is currently not supported to transcribe to different languages in a single batch. Please make sure to either force a single language by passing `languag='...'` or make sure all input audio is of the same language.") + raise ValueError( + "Multiple languages detected when trying to guess the target language for transcription. It is currently not supported to transcribe to different languages in a single batch. Please make sure to either force a single language by passing `languag='...'` or make sure all input audio is of the same language." + ) lang_id = lang_ids[0].item() - + # append or replace lang_id to init_tokens if len(init_tokens) > 1: - init_tokens[1] = lang_id + init_tokens[1] = lang_id else: init_tokens.append(lang_id) @@ -1188,7 +1195,13 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): return init_tokens - def detect_language(self, input_features: Optional[torch.FloatTensor], encoder_outputs: Optional[Union[torch.FloatTensor, BaseModelOutput]], generation_config: Optional[GenerationConfig] = None, num_segment_frames: int = 3000) -> torch.Tensor: + def detect_language( + self, + input_features: Optional[torch.FloatTensor], + encoder_outputs: Optional[Union[torch.FloatTensor, BaseModelOutput]], + generation_config: Optional[GenerationConfig] = None, + num_segment_frames: int = 3000, + ) -> torch.Tensor: """ Detects language from log-mel input features or encoder_outputs @@ -1218,17 +1231,22 @@ def detect_language(self, input_features: Optional[torch.FloatTensor], encoder_o """ if input_features is None and encoder_outputs is None: raise ValueError("You have to specify either `input_features` or `encoder_outputs`") - elif not input_features is None and not encoder_outputs is None: + elif input_features is not None and encoder_outputs is not None: raise ValueError("Make sure to specificy only one of `input_features` or `encoder_outputs` - not both!") elif input_features is not None: inputs = {"input_features": input_features[:, :, :num_segment_frames]} batch_size = input_features.shape[0] elif encoder_outputs is not None: inputs = {"encoder_outputs": encoder_outputs} - batch_size = encoder_outputs[0].shape[0] if isinstance(encoder_outputs, BaseModelOutput) else encoder_outputs[0] + batch_size = ( + encoder_outputs[0].shape[0] if isinstance(encoder_outputs, BaseModelOutput) else encoder_outputs[0] + ) generation_config = generation_config or self.generation_config - decoder_input_ids = torch.ones((batch_size, 1), device=self.device, dtype=torch.long) * generation_config.decoder_start_token_id + decoder_input_ids = ( + 
torch.ones((batch_size, 1), device=self.device, dtype=torch.long) + * generation_config.decoder_start_token_id + ) with torch.no_grad(): logits = self(**inputs, decoder_input_ids=decoder_input_ids).logits[:, -1] @@ -1242,7 +1260,6 @@ def detect_language(self, input_features: Optional[torch.FloatTensor], encoder_o return lang_ids - @staticmethod def _check_decoder_input_ids(prompt_ids, init_tokens, is_shortform, kwargs): decoder_input_ids = kwargs.get("decoder_input_ids", None) From 4fc1c5e593cc411ab6be2247d0f19a298d87ee1b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 29 Jan 2024 16:54:14 +0000 Subject: [PATCH 19/25] Add lang detection test --- .../models/whisper/generation_whisper.py | 4 +-- tests/models/whisper/test_modeling_whisper.py | 32 ++++++++++++++++++- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 68e95ac7ea2f95..871699b95e1715 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -1197,8 +1197,8 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): def detect_language( self, - input_features: Optional[torch.FloatTensor], - encoder_outputs: Optional[Union[torch.FloatTensor, BaseModelOutput]], + input_features: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[Union[torch.FloatTensor, BaseModelOutput]] = None, generation_config: Optional[GenerationConfig] = None, num_segment_frames: int = 3000, ) -> torch.Tensor: diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 01285cfdd8f58f..130999768e3fcf 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -38,7 +38,8 @@ slow, torch_device, ) -from transformers.utils import cached_property, is_flax_available, is_torch_available +from huggingface_hub import hf_hub_download +from transformers.utils import cached_property, is_flax_available, is_torch_available, is_torchaudio_available from transformers.utils.import_utils import is_datasets_available from ...generation.test_utils import GenerationTesterMixin @@ -142,6 +143,10 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to return scores +if is_torchaudio_available(): + import torchaudio + + if is_flax_available(): import jax.numpy as jnp @@ -2020,6 +2025,31 @@ def test_generate_with_prompt_ids(self): self.assertEqual(output_without_prompt, expected_without_prompt) self.assertEqual(output_with_prompt, expected_with_prompt) + @slow + def test_language_detection(self): + processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") + model.to(torch_device) + input_speech = self._load_datasamples(4)[-1:] + input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device) + + lang_id = model.detect_language(input_features)[0].item() + + ids_to_lang = {v: k for k,v in model.generation_config.lang_to_id.items()} + + assert ids_to_lang[lang_id] == "<|en|>" + + audio = hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset") + + raw_audio, sr = torchaudio.load(audio) + input_speech = torchaudio.transforms.Resample(sr, 16_000)(raw_audio).numpy() + + input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device) + + lang_id = 
model.detect_language(input_features)[0].item() + + assert ids_to_lang[lang_id] == "<|hi|>" + @slow def test_generate_with_prompt_ids_and_forced_decoder_ids(self): processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") From 968197a647f96fa9c145a51843021fea7fa68083 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 29 Jan 2024 18:18:05 +0000 Subject: [PATCH 20/25] Fix lang detection behavior --- tests/models/whisper/test_modeling_whisper.py | 87 ++++++++++++++++++- 1 file changed, 85 insertions(+), 2 deletions(-) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 130999768e3fcf..99fa0c5ceeabcd 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -25,6 +25,7 @@ import numpy as np import pytest +from huggingface_hub import hf_hub_download import transformers from transformers import WhisperConfig @@ -38,7 +39,6 @@ slow, torch_device, ) -from huggingface_hub import hf_hub_download from transformers.utils import cached_property, is_flax_available, is_torch_available, is_torchaudio_available from transformers.utils.import_utils import is_datasets_available @@ -2035,7 +2035,7 @@ def test_language_detection(self): lang_id = model.detect_language(input_features)[0].item() - ids_to_lang = {v: k for k,v in model.generation_config.lang_to_id.items()} + ids_to_lang = {v: k for k, v in model.generation_config.lang_to_id.items()} assert ids_to_lang[lang_id] == "<|en|>" @@ -2050,6 +2050,89 @@ def test_language_detection(self): assert ids_to_lang[lang_id] == "<|hi|>" + @slow + def test_default_multilingual_transcription_short_form(self): + processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") + model.to(torch_device) + + audio = hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset") + + raw_audio, sr = torchaudio.load(audio) + input_speech = torchaudio.transforms.Resample(sr, 16_000)(raw_audio).numpy() + + input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device) + + # model.generation_config.forced_decoder_ids defaults to [1, null] for lang_token + sequences = model.generate(input_features) + + transcription = processor.batch_decode(sequences, skip_special_tokens=False)[0] + + assert ( + transcription + == "<|startoftranscript|><|hi|><|transcribe|><|notimestamps|> Mirchi mein ki tene vibinda prajatiya hai<|endoftext|>" + ) + + # set forced_decoder_ids to English + model.generation_config.forced_decoder_ids[0][-1] = 50259 + + sequences = model.generate(input_features) + transcription = processor.batch_decode(sequences, skip_special_tokens=False) + + assert ( + transcription + == "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> MIRCHI MET, which is the name of the Bible.<|endoftext|>" + ) + + # even if forced_decoder_ids defaults to lang_token==English, setting `language=None` triggers lang_detection + sequences = model.generate(input_features, language=None) + transcription = processor.batch_decode(sequences, skip_special_tokens=False)[0] + + assert ( + transcription + == "<|startoftranscript|><|hi|><|transcribe|><|notimestamps|> Mirchi mein ki tene vibinda prajatiya hai<|endoftext|>" + ) + + @slow + def test_default_multilingual_transcription_long_form(self): + processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2") + model = 
WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2") + model.to(torch_device) + + audio = hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset") + + raw_audio, sr = torchaudio.load(audio) + input_speech = torchaudio.transforms.Resample(sr, 16_000)(raw_audio) + + input_speech = input_speech.repeat(1, 10).numpy() + input_features = processor( + input_speech, return_tensors="pt", padding="longest", truncation=False + ).input_features.to(torch_device) + + # model.generation_config.forced_decoder_ids defaults to [1, null] for lang_token + sequences = model.generate(input_features) + + transcription = processor.batch_decode(sequences)[0] + + assert transcription == " मिर्ची में कितने विबिन्द प्रजातियां हैं? मिर्ची में कितने विबिन्द प्रजातियां हैं?" + + # set forced_decoder_ids to English + model.generation_config.forced_decoder_ids[0][-1] = 50259 + + sequences = model.generate(input_features) + transcription = processor.batch_decode(sequences)[0] + + assert ( + transcription + == " How many different species are there in the chilli? How many different species are there in the chili?" + ) + + # even if forced_decoder_ids defaults to lang_token==English, setting `language=None` triggers lang_detection + sequences = model.generate(input_features, language=None) + transcription = processor.batch_decode(sequences)[0] + + assert transcription == " मिर्ची में कितने विबिन्द प्रजातियां हैं? मिर्ची में कितने विबिन्द प्रजातियां हैं?" + @slow def test_generate_with_prompt_ids_and_forced_decoder_ids(self): processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") From 1ed2880e41fea1b037313bb7e3102df213a4168e Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 30 Jan 2024 12:50:23 +0200 Subject: [PATCH 21/25] make style --- src/transformers/models/whisper/generation_whisper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 871699b95e1715..210d99550d2b39 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -1122,6 +1122,7 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): # TODO(Sanchit): set generation_config.forced_decoder_ids to None for v4.39 generation_config.forced_decoder_ids = forced_decoder_ids if len(forced_decoder_ids) > 0 else None + is_lang_id_undefined = len(init_tokens) <= 1 or (len(init_tokens) > 1 and init_tokens[1] is None) if language is not None: if language in generation_config.lang_to_id.keys(): language_token = language @@ -1145,7 +1146,7 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): # if language is defined it'll overwrite language ids that might have already been defined via the generation_config replace_or_add(init_tokens, lang_id, generation_config.lang_to_id.values()) - elif len(init_tokens) <= 1 or (len(init_tokens) > 1 and init_tokens[1] is None): + elif hasattr(generation_config, "lang_to_id") and is_lang_id_undefined: # language is not defined or intentially set to `None` to trigger language detection lang_ids = self.detect_language( input_features=input_features, From 0be1822c2b6c05ffd74eb60565256e77bd92394e Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 30 Jan 2024 16:09:16 +0200 Subject: [PATCH 22/25] Update src/transformers/models/whisper/generation_whisper.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> --- 
src/transformers/models/whisper/generation_whisper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 210d99550d2b39..8705a63ac7043f 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -1157,7 +1157,7 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): if torch.unique(lang_ids).shape[0] > 1: raise ValueError( - "Multiple languages detected when trying to guess the target language for transcription. It is currently not supported to transcribe to different languages in a single batch. Please make sure to either force a single language by passing `languag='...'` or make sure all input audio is of the same language." + "Multiple languages detected when trying to predict the most likely target language for transcription. It is currently not supported to transcribe to different languages in a single batch. Please make sure to either force a single language by passing `language='...'` or make sure all input audio is of the same language." ) lang_id = lang_ids[0].item() From 524cd3c01fa2f13c15c81d3326c76a024518dc22 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 30 Jan 2024 16:30:07 +0200 Subject: [PATCH 23/25] better error message --- src/transformers/models/whisper/generation_whisper.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 210d99550d2b39..090db63a2900a4 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -1113,9 +1113,10 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): forced_decoder_ids = forced_decoder_ids[1:] i += 1 + # TODO(Sanchit): Let's make sure we don't allow incorrectly / weirdly formatted `forced_decoder_ids` after transformers v4.39 if len(forced_decoder_ids) > 0: warnings.warn( - f"You are using token ids in `forced_decoder_ids` that do not seem to be part of the initial prompt ids: {forced_decoder_ids}. This functionality has been deprecated and will throw an error in v4.39.", + f"You are using token ids in `forced_decoder_ids` that do not seem to correctly follow the prompt pattern of Whisper. Make sure that {forced_decoder_ids} has an entry for all indices >= 1 and < {forced_decoder_ids[0][0]}. `forced_decoder_ids` will be passed as a logit processor, but note that this functionality has been deprecated and will throw an error in v4.39.", FutureWarning, ) @@ -1189,6 +1190,9 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): ): init_tokens.append(generation_config.no_timestamps_token_id) elif generation_config.return_timestamps and init_tokens[-1] == generation_config.no_timestamps_token_id: + logger.info( + "<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `'True'`." 
+ ) init_tokens = init_tokens[:-1] # let's make sure we don't pass `None` tokens as prompt tokens From d0019fa754454b61dca269ac0ded49c33164fada Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 30 Jan 2024 20:28:26 +0000 Subject: [PATCH 24/25] make style tests --- tests/models/whisper/test_modeling_whisper.py | 17 +---------------- ...st_pipelines_automatic_speech_recognition.py | 2 +- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 99fa0c5ceeabcd..1f92f1523dbbde 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -2077,20 +2077,11 @@ def test_default_multilingual_transcription_short_form(self): model.generation_config.forced_decoder_ids[0][-1] = 50259 sequences = model.generate(input_features) - transcription = processor.batch_decode(sequences, skip_special_tokens=False) - - assert ( - transcription - == "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> MIRCHI MET, which is the name of the Bible.<|endoftext|>" - ) - - # even if forced_decoder_ids defaults to lang_token==English, setting `language=None` triggers lang_detection - sequences = model.generate(input_features, language=None) transcription = processor.batch_decode(sequences, skip_special_tokens=False)[0] assert ( transcription - == "<|startoftranscript|><|hi|><|transcribe|><|notimestamps|> Mirchi mein ki tene vibinda prajatiya hai<|endoftext|>" + == "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> MIRCHI MET, which is the name of the Bible.<|endoftext|>" ) @slow @@ -2127,12 +2118,6 @@ def test_default_multilingual_transcription_long_form(self): == " How many different species are there in the chilli? How many different species are there in the chili?" ) - # even if forced_decoder_ids defaults to lang_token==English, setting `language=None` triggers lang_detection - sequences = model.generate(input_features, language=None) - transcription = processor.batch_decode(sequences)[0] - - assert transcription == " मिर्ची में कितने विबिन्द प्रजातियां हैं? मिर्ची में कितने विबिन्द प्रजातियां हैं?" 
- @slow def test_generate_with_prompt_ids_and_forced_decoder_ids(self): processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index e99f8d6862e386..f3a51a4b77961a 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -1462,7 +1462,7 @@ def test_slow_unfinished_sequence(self): out, { "text": "मिर्ची में कितने विभिन्न प्रजातियां हैं", - "chunks": [{"timestamp": (0.26, None), "text": "मिर्ची में कितने विभिन्न प्रजातियां हैं"}], + "chunks": [{"timestamp": (0.58, None), "text": "मिर्ची में कितने विभिन्न प्रजातियां हैं"}], }, ) From f6c70f0ec07f01e4a8d4af9f3d4cdebd5b3ca2a6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 30 Jan 2024 20:42:21 +0000 Subject: [PATCH 25/25] add warning --- .../models/whisper/generation_whisper.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index a2868f907f1b56..0d6addb5631bec 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -1082,18 +1082,24 @@ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): lst.append(num) return lst + task = getattr(generation_config, "task", None) + language = getattr(generation_config, "language", None) + if kwargs.get("forced_decoder_ids", None) is not None: forced_decoder_ids = kwargs["forced_decoder_ids"] elif hasattr(generation_config, "forced_decoder_ids") and generation_config.forced_decoder_ids is not None: forced_decoder_ids = generation_config.forced_decoder_ids + + if language is None and task is None and forced_decoder_ids[0][1] is None: + logger.warning_once( + "Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English." + "This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`." + ) elif hasattr(config, "forced_decoder_ids") and config.forced_decoder_ids is not None: forced_decoder_ids = config.forced_decoder_ids else: forced_decoder_ids = None - task = getattr(generation_config, "task", None) - language = getattr(generation_config, "language", None) - if forced_decoder_ids is not None and task is not None: logger.info( f"You have passed task={task}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. `forced_decoder_ids` will be ignored in favor of task={task}."
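The patches above change the default behavior of multilingual Whisper checkpoints: when neither `language` nor a language entry in `forced_decoder_ids` pins a language token, `generate` now runs language detection on the first segment and transcribes in the detected language instead of implicitly translating to English. The snippet below is a minimal usage sketch of that flow, not part of the diff itself; it assumes a multilingual checkpoint (`openai/whisper-tiny`, as in the tests) and a hypothetical 16 kHz mono waveform `audio_array`.

    from transformers import WhisperForConditionalGeneration, WhisperProcessor

    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

    # `audio_array` is a placeholder for a 16 kHz mono waveform (numpy array or list of floats).
    input_features = processor(audio_array, sampling_rate=16_000, return_tensors="pt").input_features

    # New helper introduced in this series: returns one language token id per batch item.
    lang_ids = model.detect_language(input_features)
    ids_to_lang = {v: k for k, v in model.generation_config.lang_to_id.items()}
    print(ids_to_lang[lang_ids[0].item()])  # e.g. "<|hi|>"

    # Default call: language detection, then transcription in the detected language.
    detected = model.generate(input_features, return_timestamps=True)

    # Forcing a language (and task) bypasses detection and pins the corresponding init token.
    english = model.generate(input_features, language="en", task="transcribe")

Although `detect_language` returns one id per sample, the new `ValueError` in `_retrieve_init_tokens` rejects batches whose samples resolve to different languages, so either group inputs by language or force a single `language=...` for the whole batch.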