From a837f664fc394787e03c57aa08ada9d2ca22e972 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 13 Mar 2025 22:27:37 +0000 Subject: [PATCH 001/116] First pass at speech granite Add encoder / projector, rename things --- src/transformers/models/__init__.py | 1 + .../models/granite_speech/__init__.py | 28 +++ .../configuration_granite_speech.py | 105 +++++++++ .../models/granite_speech/conformer.py | 210 ++++++++++++++++++ .../models/granite_speech/encoder.py | 47 ++++ .../feature_extraction_granite_speech.py | 104 +++++++++ .../granite_speech/modeling_granite_speech.py | 150 +++++++++++++ .../processing_granite_speech.py | 113 ++++++++++ .../models/granite_speech/projector.py | 38 ++++ 9 files changed, 796 insertions(+) create mode 100644 src/transformers/models/granite_speech/__init__.py create mode 100644 src/transformers/models/granite_speech/configuration_granite_speech.py create mode 100644 src/transformers/models/granite_speech/conformer.py create mode 100644 src/transformers/models/granite_speech/encoder.py create mode 100644 src/transformers/models/granite_speech/feature_extraction_granite_speech.py create mode 100644 src/transformers/models/granite_speech/modeling_granite_speech.py create mode 100644 src/transformers/models/granite_speech/processing_granite_speech.py create mode 100644 src/transformers/models/granite_speech/projector.py diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 7b219104548a..5d30143f9806 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -127,6 +127,7 @@ from .granite import * from .granitemoe import * from .granitemoeshared import * + from .granite_speech import * from .grounding_dino import * from .groupvit import * from .helium import * diff --git a/src/transformers/models/granite_speech/__init__.py b/src/transformers/models/granite_speech/__init__.py new file mode 100644 index 000000000000..41b94195c901 --- /dev/null +++ b/src/transformers/models/granite_speech/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2025 EleutherAI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
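+# NOTE: this __init__ follows the usual transformers lazy-import pattern:
+# submodules are only resolved on first attribute access via _LazyModule, so
+# optional dependencies (e.g. torchaudio used by the feature extractor) are
+# not imported unless they are actually needed.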
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + +if TYPE_CHECKING: + # TODO - handle optional dependencies + from .configuration_granite_speech import * + from .feature_extraction_granite_speech import * + from .modeling_granite_speech import * + from .processing_granite_speech import * +else: + import sys + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py new file mode 100644 index 000000000000..e3814af4fb22 --- /dev/null +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -0,0 +1,105 @@ +from transformers.configuration_utils import PretrainedConfig +from transformers.models.auto import AutoConfig +from transformers.models.blip_2.configuration_blip_2 import Blip2QFormerConfig + + +class GraniteSpeechEncoderConfig(PretrainedConfig): + def __init__( + self, + input_dim=160, + num_layers=10, + hidden_dim=1024, + feedforward_mult=4, + num_heads=8, + dim_head=128, + output_dim=42, + context_size=200, + dropout=0.1, + conv_kernel_size=15, + conv_expansion_factor=2, + ): + self.input_dim = input_dim + self.num_layers = num_layers + self.hidden_dim = hidden_dim + self.feedforward_mult = feedforward_mult + self.num_heads = num_heads + self.dim_head = dim_head + self.output_dim = output_dim + self.context_size = context_size + + self.dropout = dropout + self.conv_kernel_size = conv_kernel_size + self.conv_expansion_factor = conv_expansion_factor + + +class GraniteSpeechProjectorConfig(Blip2QFormerConfig): + def __init__( + self, + llm_dim=4096, + downsample_rate=5, + window_size=15, + hidden_size=1024, + num_attention_heads=16, + intermediate_size=4096, + num_hidden_layers=2, + encoder_hidden_size=1024, + cross_attention_frequency=1, + max_position_embeddings=2048, + use_qformer_text_input=False, + ): + super().__init__( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + num_hidden_layers=num_hidden_layers, + encoder_hidden_size=encoder_hidden_size, + cross_attention_frequency=cross_attention_frequency, + max_position_embeddings=max_position_embeddings, + use_qformer_text_input=use_qformer_text_input, + ) + + self.downsample_rate = downsample_rate + self.window_size = window_size + self.llm_dim = llm_dim + + +class GraniteSpeechConfig(PretrainedConfig): + model_type = "speech_granite" + # TODO - Probably should consolidate these into a single config + sub_configs = { + "llm_config": AutoConfig, + "encoder_config": GraniteSpeechEncoderConfig, + "projector_config": GraniteSpeechProjectorConfig, + } + + def __init__( + self, + encoder_config=None, + llm_config=None, + projector_config=None, + # TODO - need to figure out how to handle lora here / separation of peft integration with peft + # Keeping it here during the initial porting + lora_r=64, + lora_alpha=32, + lora_modules=["q_proj", "v_proj"], + # TODO - we should use a text config here instead of the direct model, then use from_config() + llm_name="ibm-granite/granite-3.1-8b-instruct", + audio_token_index=49155, + **kwargs, + ): + if llm_config is None: + llm_config = AutoConfig.from_pretrained(llm_name) + if encoder_config is None: + encoder_config = GraniteSpeechEncoderConfig() + if projector_config is None: + projector_config = 
GraniteSpeechProjectorConfig() + + self.encoder_config = encoder_config + self.llm_config = llm_config + self.projector_config = projector_config + self.lora_r = lora_r + self.lora_alpha = lora_alpha + self.lora_modules = lora_modules + self.llm_name = llm_name + self.audio_token_index = audio_token_index + super().__init__(**kwargs) diff --git a/src/transformers/models/granite_speech/conformer.py b/src/transformers/models/granite_speech/conformer.py new file mode 100644 index 000000000000..563297fd8827 --- /dev/null +++ b/src/transformers/models/granite_speech/conformer.py @@ -0,0 +1,210 @@ +# From https://github.com/lucidrains/conformer.git +import torch +from torch import nn, einsum +import torch.nn.functional as F + +# helper functions + +def calc_same_padding(kernel_size): + pad = kernel_size // 2 + return (pad, pad - (kernel_size + 1) % 2) + + +class Permute(nn.Module): + def __init__(self, dims): + super().__init__() + self.dims = dims + + def forward(self, x): + x = x.permute(self.dims) + return x + + +# helper classes + +class DepthWiseConv1d(nn.Module): + def __init__(self, chan_in, chan_out, kernel_size, padding): + super().__init__() + self.padding = padding + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups = chan_in, bias=False) + + def forward(self, x): + x = F.pad(x, self.padding) + return self.conv(x) + +# attention, feedforward, and conv module + +class Scale(nn.Module): + def __init__(self, scale, fn): + super().__init__() + self.fn = fn + self.scale = scale + + def forward(self, x, **kwargs): + return self.fn(x, **kwargs) * self.scale + +class PreNorm(nn.Module): + def __init__(self, dim, fn): + super().__init__() + self.fn = fn + self.norm = nn.LayerNorm(dim) + + def forward(self, x, **kwargs): + x = self.norm(x) + return self.fn(x, **kwargs) + +class PreNormAttn(nn.Module): + def __init__(self, dim, fn): + super().__init__() + self.fn = fn + self.norm = nn.LayerNorm(dim) + + def forward(self, x, context_size, **kwargs): + x = self.norm(x) + return self.fn(x, context_size, **kwargs) + +class Attention(nn.Module): + def __init__( + self, + dim, + heads = 8, + dim_head = 64, + dropout = 0., + context_size = 200, + max_pos_emb = 512 + ): + super().__init__() + inner_dim = dim_head * heads + self.heads= heads + self.dim_head = dim_head + self.scale = dim_head ** -0.5 + self.to_q = nn.Linear(dim, inner_dim, bias = False) + self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False) + self.to_out = nn.Linear(inner_dim, dim) + + self.max_pos_emb = max_pos_emb + self.rel_pos_emb = nn.Embedding(2 * max_pos_emb + 1, dim_head) + + self.dropout = nn.Dropout(dropout) + + def forward(self, x, context_size): + device, h, max_pos_emb = x.device, self.heads, self.max_pos_emb + bs, n, d = x.shape + assert(context_size > 0 and context_size <= max_pos_emb) + + nb = n // context_size + nr = n % context_size + if nr > 0: + y = torch.zeros(x.shape[0], context_size-nr, x.shape[2], device=device) + x = torch.cat((x,y), dim=1) + nb += 1 + + q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, dim = -1)) + q, k, v = map(lambda t: t.reshape(bs, nb, context_size, h, -1).transpose(2, 3), (q, k, v)) + dots = einsum('b m h i d, b m h j d -> b m h i j', q, k) * self.scale + + # shaw's relative positional embedding + seq = torch.arange(context_size, device = device) + dist = seq.view(-1, 1) - seq.view(1, -1) + dist = torch.clamp(dist,-context_size, context_size) + max_pos_emb + rel_pos_emb = self.rel_pos_emb(dist).to(q) + pos_attn = einsum('b m h c d, c r d -> b m h c r', q, rel_pos_emb) * 
self.scale + dots = dots + pos_attn + + if nr > 0: + mask = torch.ones(context_size, context_size, device=device) + mask[:nr,:nr] = 0 + mask_value = -torch.finfo(dots.dtype).max + dots[:,-1,:].masked_fill_(mask.bool(), mask_value) + + attn = dots.softmax(dim = -1) + + out = einsum('b m h i j, b m h j d -> b m h i d', attn, v) + out = out.transpose(2, 3).reshape(bs, x.shape[1], -1) + out = self.to_out(out[:,:n,:]) + return self.dropout(out) + +class FeedForward(nn.Module): + def __init__( + self, + dim, + mult = 4, + dropout = 0. + ): + super().__init__() + self.net = nn.Sequential( + nn.Linear(dim, dim * mult), + nn.SiLU(), + nn.Dropout(dropout), + nn.Linear(dim * mult, dim), + nn.Dropout(dropout) + ) + + def forward(self, x): + return self.net(x) + +class ConformerConvModule(nn.Module): + def __init__( + self, + dim, + causal = False, + expansion_factor = 2, + kernel_size = 31, + dropout = 0.): + super().__init__() + + inner_dim = dim * expansion_factor + padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0) + + self.net = nn.Sequential( + nn.LayerNorm(dim), + Permute(dims=(0, 2, 1)), + nn.Conv1d(dim, inner_dim * 2, 1), + nn.GLU(dim=1), + DepthWiseConv1d(inner_dim, inner_dim, kernel_size = kernel_size, padding = padding), + nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(), + nn.SiLU(), + nn.Conv1d(inner_dim, dim, 1), + Permute(dims=(0, 2, 1)), + nn.Dropout(dropout) + ) + + def forward(self, x): + return self.net(x) + +# Conformer Block + +class ConformerBlock(nn.Module): + def __init__( + self, + *, + dim, + dim_head = 64, + heads = 8, + ff_mult = 2, + conv_expansion_factor = 2, + conv_kernel_size = 31, + context_size = -1, + attn_dropout = 0., + ff_dropout = 0., + conv_dropout = 0. + ): + super().__init__() + self.ff1 = FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout) + self.attn = Attention(dim = dim, dim_head = dim_head, heads = heads, dropout = attn_dropout, context_size = context_size) + self.conv = ConformerConvModule(dim = dim, causal = False, expansion_factor = conv_expansion_factor, kernel_size = conv_kernel_size, dropout = conv_dropout) + self.ff2 = FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout) + + self.attn = PreNormAttn(dim, self.attn) + self.ff1 = Scale(0.5, PreNorm(dim, self.ff1)) + self.ff2 = Scale(0.5, PreNorm(dim, self.ff2)) + + self.post_norm = nn.LayerNorm(dim) + + def forward(self, x, context_size): + x = self.ff1(x) + x + x = self.attn(x, context_size) + x + x = self.conv(x) + x + x = self.ff2(x) + x + x = self.post_norm(x) + return x \ No newline at end of file diff --git a/src/transformers/models/granite_speech/encoder.py b/src/transformers/models/granite_speech/encoder.py new file mode 100644 index 000000000000..fd6776571601 --- /dev/null +++ b/src/transformers/models/granite_speech/encoder.py @@ -0,0 +1,47 @@ +import types +import torch +import torch.nn as nn + +from .conformer import ConformerBlock +from .configuration_granite_speech import GraniteSpeechEncoderConfig + + +class CTCModel(nn.Module): + def __init__(self, config: GraniteSpeechEncoderConfig): + super(CTCModel, self).__init__() + + self.rnn_trL = [nn.Linear(config.input_dim, config.hidden_dim, bias=True)] + for l in range(config.num_layers): + self.rnn_trL.append( + ConformerBlock( + dim=config.hidden_dim, + dim_head=config.dim_head, + heads=config.num_heads, + ff_mult=config.feedforward_mult, + conv_expansion_factor=config.conv_expansion_factor, + conv_kernel_size=config.conv_kernel_size, + context_size=config.context_size, # attention 
context size + attn_dropout=config.dropout, + ff_dropout=config.dropout, + conv_dropout=config.dropout, + ) + ) + self.rnn_tr = nn.Sequential(*self.rnn_trL) + + self.out = nn.Linear(config.hidden_dim, config.output_dim, bias=True) + self.out_mid = nn.Linear(config.output_dim, config.hidden_dim, bias=True) + self.context_size = config.context_size + self.input_dim = config.input_dim + self.num_layers = config.num_layers + self.hidden_dim = config.hidden_dim + self.output_dim = config.output_dim + + def forward(self, x: torch.Tensor): + x = self.rnn_trL[0](x) + for l in range(1, self.num_layers + 1): + x = self.rnn_trL[l](x, self.context_size) + if l == self.num_layers // 2: + x_mid = x.clone() + x_mid = self.out(x_mid) + x += self.out_mid(nn.Softmax(dim=-1)(x_mid)) + return x \ No newline at end of file diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py new file mode 100644 index 000000000000..f173c97f2d34 --- /dev/null +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -0,0 +1,104 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Feature extractor class for Speech Granite +""" +from typing import Optional +import math +import torch +import torchaudio # TODO - this needs to be handled as an optional dependency +from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +# TODO - should this be a SequenceFeatureExtractor? +class GraniteSpeechFeatureExtractor(FeatureExtractionMixin): + model_input_names = ["input_features"] + + def __init__( + self, + feature_size=0, + sampling_rate=16000, + padding_value=0, + n_fft=512, + win_length=400, + hop_length=160, + n_mels=80, + projector_window_size=15, + projector_downsample_rate=5, + **kwargs, + ): + super().__init__(**kwargs) + self.melspec_kwargs = { + "sample_rate": sampling_rate, + "n_fft": n_fft, + "win_length": win_length, + "hop_length": hop_length, + "n_mels": n_mels + } + # HACK - for now, lazily initialize the mel spectrogram transform; + # the feature extractor mixin explodes otherwise because + # it tries to log the feature extractor, and the melspectrogram + # transform isn't json serializable... + self.melspec = None + self.projector_window_size = projector_window_size + self.projector_downsample_rate = projector_downsample_rate + + def _ensure_melspec_transform_is_initialized(self): + if self.melspec is None: + self.melspec = torchaudio.transforms.MelSpectrogram( + **self.melspec_kwargs + ) + + def __call__( + self, + x: torch.Tensor, + device: Optional[str]="cpu", + ) -> BatchFeature: + # TODO there is probably a better way to do both of these things... 
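+        # Rough flow of this method: lazily build the (non JSON-serializable)
+        # mel spectrogram transform, move it and the audio to the requested
+        # device, compute log-mel features with simple dynamic range
+        # compression, then stack pairs of consecutive frames to halve the
+        # sequence length.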
+ self._ensure_melspec_transform_is_initialized() + if device is not None: + melspec = self.melspec.to(device) + x = x.to(device) + else: + melspec = self.melspec + + B, _ = x.shape + with torch.no_grad(): + mel = melspec(x.float()) + logmel = mel.transpose(-1,-2).clip_(min=1e-10).log10_() + mx = logmel.amax(dim=(-2,-1), keepdim=True) + logmel = torch.maximum(logmel, mx - 8.0).div_(4).add_(1) + if logmel.shape[1] % 2 == 1: + logmel = logmel[:,:-1] # remove last frame if odd + x = logmel.reshape(B, -1, 2 * logmel.shape[-1]) # stacking and skipping by 2 + + if x.device != "cpu": + return x.detach().cpu() + return x + + def _get_num_audio_features(self, logmel: BatchFeature) -> int: + """Gets the (variable length) variable length number of features + (i.e., projector output) for the sequence being considered. + """ + # todo: (Avihu) maybe it's better to return a list (length of each sample in the batch) + seq_len = logmel.shape[1] + nblocks = math.ceil(seq_len / self.projector_window_size) + + return nblocks * self.projector_window_size // self.projector_downsample_rate + +__all__ = ["GraniteSpeechFeatureExtractor"] \ No newline at end of file diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py new file mode 100644 index 000000000000..06b7250e79c1 --- /dev/null +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -0,0 +1,150 @@ +from typing import List, Optional + +import torch +import torch.utils.checkpoint + +from transformers.generation import GenerationMixin +from transformers.modeling_utils import PreTrainedModel +from transformers.models.granite import GraniteForCausalLM +from .configuration_granite_speech import GraniteSpeechConfig +from .projector import EncoderProjectorQFormer +from .encoder import CTCModel + +from peft import get_peft_model, LoraConfig, TaskType +import time + + +class GraniteSpeechForConditionalGeneration(PreTrainedModel, GenerationMixin): + def __init__(self, config: GraniteSpeechConfig): + super().__init__(config) + + self.llm = GraniteForCausalLM.from_pretrained(config.llm_name) + peft_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + inference_mode=True, + r=config.lora_r, + lora_alpha=config.lora_alpha, + target_modules=config.lora_modules, + ) + self.llm = get_peft_model(self.llm, peft_config) + + self.encoder = CTCModel(config.encoder_config) + + self.projector = EncoderProjectorQFormer(config.projector_config) + + encoder_state_dict = torch.load( + "data/encoder.pt", map_location="cpu", weights_only=True + ) + print(self.encoder.load_state_dict(encoder_state_dict, strict=False)) + + lora_state_dict = torch.load( + "data/lora_adapter.pt", map_location="cpu", weights_only=True + ) + self.llm.load_state_dict(lora_state_dict, strict=False) + + projector_state_dict = torch.load( + "data/projector.pt", map_location="cpu", weights_only=True + ) + self.projector.load_state_dict(projector_state_dict, strict=True) + + def forward( + self, + input_ids: torch.LongTensor = None, + input_features: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + feature_attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = 
None, + return_dict: Optional[bool] = None, + ): + if inputs_embeds is None: + inputs_embeds = self.prepare_inputs_for_generation( + input_ids=input_ids, + input_features=input_features, + attention_mask=attention_mask, + ) + llm_outputs = self.llm(inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + past_key_values=past_key_values, + position_ids=position_ids, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + + ) + return llm_outputs + + def generate( + self, + input_ids, + inputs_embeds=None, + input_features=None, + attention_mask=None, + **kwargs, + ): + if inputs_embeds is None: + inputs_embeds = self.prepare_inputs_for_generation( + input_ids=input_ids, + input_features=input_features, + attention_mask=attention_mask, + ) + model_outputs = self.llm.generate( + inputs_embeds=inputs_embeds, attention_mask=attention_mask, **kwargs + ) + return model_outputs + + def prepare_inputs_for_generation( + self, + input_ids, + input_features=None, + **kwargs, + ): + a = time.time() + encoder_embeds = self.encoder(input_features) + print("Encoder", time.time() - a, "secs") + + a = time.time() + projected_embeds = self.projector(encoder_embeds, None) + print("Projector", time.time() - a, "secs") + + a = time.time() + # concatenate embeddings and invoke LLM generate + # tokenizer.vocab[self_processor.audio_token] + combined_embeds = self.get_merged_audio_embeddings( + input_ids=input_ids, + audio_features=projected_embeds, + ) + return combined_embeds + + def get_merged_audio_embeddings(self, input_ids, audio_features): + """ + Adds the audio token to the model's LLM vocabulary so that we can pass it + through the tokenizer; it's assumed that the embeddings corresponding to the + <|audio|> token will be clobbered with speech features. + + TODO - This needs to be adapted to handle batches of variable length sequences + and potentially labels. + """ + is_audio_index = input_ids == self.config.audio_token_index + llm_input_ids = torch.where(is_audio_index, 0, input_ids) + inputs_embeds = self.llm.get_input_embeddings()( + llm_input_ids + ) # [bsz, # features, hidden size] + + # Mask the audio features into the text embeddings + special_audio_mask = is_audio_index.unsqueeze(-1) + audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter( + special_audio_mask, + audio_features, + ) + return inputs_embeds + +__all__ = ["GraniteSpeechForConditionalGeneration"] diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py new file mode 100644 index 000000000000..f531e352e7cb --- /dev/null +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -0,0 +1,113 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Speech Granite. 
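+
+Wraps a GraniteSpeechFeatureExtractor and a text tokenizer; each <|audio|>
+placeholder in the prompt is expanded to match the number of projected audio
+features before tokenization.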
+""" +from typing import List, Union + +import numpy as np +import torch +from transformers.feature_extraction_utils import BatchFeature +from transformers.processing_utils import ProcessorMixin +from transformers.tokenization_utils import PreTokenizedInput, TextInput +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +class GraniteSpeechProcessor(ProcessorMixin): + + attributes = ["feature_extractor", "tokenizer"] + valid_kwargs = ["audio_token"] + + feature_extractor_class = "GraniteSpeechFeatureExtractor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + feature_extractor, + tokenizer, + audio_token="<|audio|>", + ): + self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token + super().__init__(feature_extractor, tokenizer) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + audios: Union[np.ndarray, List[np.ndarray]] = None, + device: str = "cpu", + **kwargs, + ) -> BatchFeature: + + if text is None and audios is None: + raise ValueError("You have to provide audio or text") + + speech_inputs = {} + text_inputs = {} + + text = self._get_validated_text(text) + audios = self._get_validated_audios(audios) + # TODO: assert that len(audios) == count(audio_token, text) + + if audios is not None: + # Calculate Mel features & the number of placeholders we will need + speech_inputs["input_features"] = self.feature_extractor( + audios, + device=device, + ) + num_audio_features = self.feature_extractor._get_num_audio_features( + speech_inputs["input_features"], + ) + + # duplicate the audio placeholders to match the feature dims + text = self._expand_audio_placeholders(text, num_audio_features) + + if text is not None: + text_inputs = self.tokenizer(text, **kwargs) + return BatchFeature(data={**text_inputs, **speech_inputs}) + + def _expand_audio_placeholders(self, text: list[str], num_audio_features: int): + """ + Expands audio placeholders in the formatted text to match the number of + features of the corresponding embeddings; we can use the resulting text + to conveniently mask the audio features into the text embeddings. + """ + prompt_strings = [] + for sample in text: + while self.audio_token in sample: + # todo: (Avihu): this assumes all audios have the same length. + sample = sample.replace(self.audio_token, "" * num_audio_features, 1) + prompt_strings.append(sample) + prompt_strings = [sample.replace("", self.audio_token) for sample in prompt_strings] + return prompt_strings + + ##### Validation + def _get_validated_text(self, text: Union[str, list]) -> List[str]: + if isinstance(text, str): + return [text] + elif isinstance(text, list) and isinstance(text[0], str): + return text + raise TypeError("Invalid text provided! Text should be a string or list of strings.") + + def _get_validated_audios(self, audios): + # todo: if this is a list, collate and keep track of audio lengths + if audios is not None and not isinstance(audios, torch.Tensor): + raise TypeError("Invalid audios provided! 
Audio should be a torch tensor.") + return audios + + +__all__ = ["GraniteSpeechProcessor"] \ No newline at end of file diff --git a/src/transformers/models/granite_speech/projector.py b/src/transformers/models/granite_speech/projector.py new file mode 100644 index 000000000000..0f56d1a931b5 --- /dev/null +++ b/src/transformers/models/granite_speech/projector.py @@ -0,0 +1,38 @@ +import torch +import torch.nn as nn +from .configuration_granite_speech import GraniteSpeechConfig +from transformers import Blip2QFormerModel +import math + +class EncoderProjectorQFormer(nn.Module): + def __init__(self, config: GraniteSpeechConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.ds_rate = config.downsample_rate + self.window_size = config.window_size + self.num_queries = self.window_size // self.ds_rate + self.query = nn.Parameter(torch.zeros(1, self.num_queries, config.hidden_size)) + self.query.data.normal_(mean=0.0, std=1.0) + self.qformer = Blip2QFormerModel(config) + self.linear = nn.Linear(config.hidden_size, config.llm_dim) + + def forward(self, x, atts): + batch_size, seq_len, dim = x.size() + nblocks = math.ceil(seq_len / self.window_size) + pad = nblocks * self.window_size - seq_len + x = nn.functional.pad(x, (0, 0, 0, pad), "constant", 0) + x = x.view(batch_size * nblocks, self.window_size, dim) + + query_output = self.qformer( + query_embeds=self.query.data, + encoder_hidden_states=x, + encoder_attention_mask=atts, + return_dict=True, + ) + query_proj = self.linear( + query_output.last_hidden_state.view( + batch_size, nblocks * self.window_size // self.ds_rate, -1 + ) + ) + + return query_proj \ No newline at end of file From 3bdd91a2f435bfd5c54328368338ccd920befcb0 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 14 Mar 2025 22:06:38 +0000 Subject: [PATCH 002/116] Combine into one model file with causal lm outputs for forward --- .../models/granite_speech/conformer.py | 210 -------- .../models/granite_speech/encoder.py | 47 -- .../granite_speech/modeling_granite_speech.py | 458 +++++++++++++++--- .../models/granite_speech/projector.py | 38 -- 4 files changed, 400 insertions(+), 353 deletions(-) delete mode 100644 src/transformers/models/granite_speech/conformer.py delete mode 100644 src/transformers/models/granite_speech/encoder.py delete mode 100644 src/transformers/models/granite_speech/projector.py diff --git a/src/transformers/models/granite_speech/conformer.py b/src/transformers/models/granite_speech/conformer.py deleted file mode 100644 index 563297fd8827..000000000000 --- a/src/transformers/models/granite_speech/conformer.py +++ /dev/null @@ -1,210 +0,0 @@ -# From https://github.com/lucidrains/conformer.git -import torch -from torch import nn, einsum -import torch.nn.functional as F - -# helper functions - -def calc_same_padding(kernel_size): - pad = kernel_size // 2 - return (pad, pad - (kernel_size + 1) % 2) - - -class Permute(nn.Module): - def __init__(self, dims): - super().__init__() - self.dims = dims - - def forward(self, x): - x = x.permute(self.dims) - return x - - -# helper classes - -class DepthWiseConv1d(nn.Module): - def __init__(self, chan_in, chan_out, kernel_size, padding): - super().__init__() - self.padding = padding - self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups = chan_in, bias=False) - - def forward(self, x): - x = F.pad(x, self.padding) - return self.conv(x) - -# attention, feedforward, and conv module - -class Scale(nn.Module): - def __init__(self, scale, fn): - super().__init__() - self.fn = fn - self.scale = 
scale - - def forward(self, x, **kwargs): - return self.fn(x, **kwargs) * self.scale - -class PreNorm(nn.Module): - def __init__(self, dim, fn): - super().__init__() - self.fn = fn - self.norm = nn.LayerNorm(dim) - - def forward(self, x, **kwargs): - x = self.norm(x) - return self.fn(x, **kwargs) - -class PreNormAttn(nn.Module): - def __init__(self, dim, fn): - super().__init__() - self.fn = fn - self.norm = nn.LayerNorm(dim) - - def forward(self, x, context_size, **kwargs): - x = self.norm(x) - return self.fn(x, context_size, **kwargs) - -class Attention(nn.Module): - def __init__( - self, - dim, - heads = 8, - dim_head = 64, - dropout = 0., - context_size = 200, - max_pos_emb = 512 - ): - super().__init__() - inner_dim = dim_head * heads - self.heads= heads - self.dim_head = dim_head - self.scale = dim_head ** -0.5 - self.to_q = nn.Linear(dim, inner_dim, bias = False) - self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False) - self.to_out = nn.Linear(inner_dim, dim) - - self.max_pos_emb = max_pos_emb - self.rel_pos_emb = nn.Embedding(2 * max_pos_emb + 1, dim_head) - - self.dropout = nn.Dropout(dropout) - - def forward(self, x, context_size): - device, h, max_pos_emb = x.device, self.heads, self.max_pos_emb - bs, n, d = x.shape - assert(context_size > 0 and context_size <= max_pos_emb) - - nb = n // context_size - nr = n % context_size - if nr > 0: - y = torch.zeros(x.shape[0], context_size-nr, x.shape[2], device=device) - x = torch.cat((x,y), dim=1) - nb += 1 - - q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, dim = -1)) - q, k, v = map(lambda t: t.reshape(bs, nb, context_size, h, -1).transpose(2, 3), (q, k, v)) - dots = einsum('b m h i d, b m h j d -> b m h i j', q, k) * self.scale - - # shaw's relative positional embedding - seq = torch.arange(context_size, device = device) - dist = seq.view(-1, 1) - seq.view(1, -1) - dist = torch.clamp(dist,-context_size, context_size) + max_pos_emb - rel_pos_emb = self.rel_pos_emb(dist).to(q) - pos_attn = einsum('b m h c d, c r d -> b m h c r', q, rel_pos_emb) * self.scale - dots = dots + pos_attn - - if nr > 0: - mask = torch.ones(context_size, context_size, device=device) - mask[:nr,:nr] = 0 - mask_value = -torch.finfo(dots.dtype).max - dots[:,-1,:].masked_fill_(mask.bool(), mask_value) - - attn = dots.softmax(dim = -1) - - out = einsum('b m h i j, b m h j d -> b m h i d', attn, v) - out = out.transpose(2, 3).reshape(bs, x.shape[1], -1) - out = self.to_out(out[:,:n,:]) - return self.dropout(out) - -class FeedForward(nn.Module): - def __init__( - self, - dim, - mult = 4, - dropout = 0. 
- ): - super().__init__() - self.net = nn.Sequential( - nn.Linear(dim, dim * mult), - nn.SiLU(), - nn.Dropout(dropout), - nn.Linear(dim * mult, dim), - nn.Dropout(dropout) - ) - - def forward(self, x): - return self.net(x) - -class ConformerConvModule(nn.Module): - def __init__( - self, - dim, - causal = False, - expansion_factor = 2, - kernel_size = 31, - dropout = 0.): - super().__init__() - - inner_dim = dim * expansion_factor - padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0) - - self.net = nn.Sequential( - nn.LayerNorm(dim), - Permute(dims=(0, 2, 1)), - nn.Conv1d(dim, inner_dim * 2, 1), - nn.GLU(dim=1), - DepthWiseConv1d(inner_dim, inner_dim, kernel_size = kernel_size, padding = padding), - nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(), - nn.SiLU(), - nn.Conv1d(inner_dim, dim, 1), - Permute(dims=(0, 2, 1)), - nn.Dropout(dropout) - ) - - def forward(self, x): - return self.net(x) - -# Conformer Block - -class ConformerBlock(nn.Module): - def __init__( - self, - *, - dim, - dim_head = 64, - heads = 8, - ff_mult = 2, - conv_expansion_factor = 2, - conv_kernel_size = 31, - context_size = -1, - attn_dropout = 0., - ff_dropout = 0., - conv_dropout = 0. - ): - super().__init__() - self.ff1 = FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout) - self.attn = Attention(dim = dim, dim_head = dim_head, heads = heads, dropout = attn_dropout, context_size = context_size) - self.conv = ConformerConvModule(dim = dim, causal = False, expansion_factor = conv_expansion_factor, kernel_size = conv_kernel_size, dropout = conv_dropout) - self.ff2 = FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout) - - self.attn = PreNormAttn(dim, self.attn) - self.ff1 = Scale(0.5, PreNorm(dim, self.ff1)) - self.ff2 = Scale(0.5, PreNorm(dim, self.ff2)) - - self.post_norm = nn.LayerNorm(dim) - - def forward(self, x, context_size): - x = self.ff1(x) + x - x = self.attn(x, context_size) + x - x = self.conv(x) + x - x = self.ff2(x) + x - x = self.post_norm(x) - return x \ No newline at end of file diff --git a/src/transformers/models/granite_speech/encoder.py b/src/transformers/models/granite_speech/encoder.py deleted file mode 100644 index fd6776571601..000000000000 --- a/src/transformers/models/granite_speech/encoder.py +++ /dev/null @@ -1,47 +0,0 @@ -import types -import torch -import torch.nn as nn - -from .conformer import ConformerBlock -from .configuration_granite_speech import GraniteSpeechEncoderConfig - - -class CTCModel(nn.Module): - def __init__(self, config: GraniteSpeechEncoderConfig): - super(CTCModel, self).__init__() - - self.rnn_trL = [nn.Linear(config.input_dim, config.hidden_dim, bias=True)] - for l in range(config.num_layers): - self.rnn_trL.append( - ConformerBlock( - dim=config.hidden_dim, - dim_head=config.dim_head, - heads=config.num_heads, - ff_mult=config.feedforward_mult, - conv_expansion_factor=config.conv_expansion_factor, - conv_kernel_size=config.conv_kernel_size, - context_size=config.context_size, # attention context size - attn_dropout=config.dropout, - ff_dropout=config.dropout, - conv_dropout=config.dropout, - ) - ) - self.rnn_tr = nn.Sequential(*self.rnn_trL) - - self.out = nn.Linear(config.hidden_dim, config.output_dim, bias=True) - self.out_mid = nn.Linear(config.output_dim, config.hidden_dim, bias=True) - self.context_size = config.context_size - self.input_dim = config.input_dim - self.num_layers = config.num_layers - self.hidden_dim = config.hidden_dim - self.output_dim = config.output_dim - - def forward(self, x: 
torch.Tensor): - x = self.rnn_trL[0](x) - for l in range(1, self.num_layers + 1): - x = self.rnn_trL[l](x, self.context_size) - if l == self.num_layers // 2: - x_mid = x.clone() - x_mid = self.out(x_mid) - x += self.out_mid(nn.Softmax(dim=-1)(x_mid)) - return x \ No newline at end of file diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 06b7250e79c1..e5f8ccacf9ef 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -1,24 +1,329 @@ -from typing import List, Optional +import math +from dataclasses import dataclass +from typing import List, Optional, Union, Tuple import torch import torch.utils.checkpoint +from torch import nn, einsum +import torch.nn.functional as F +from ...modeling_outputs import BaseModelOutput, ModelOutput + +from transformers import Blip2QFormerModel from transformers.generation import GenerationMixin from transformers.modeling_utils import PreTrainedModel from transformers.models.granite import GraniteForCausalLM -from .configuration_granite_speech import GraniteSpeechConfig -from .projector import EncoderProjectorQFormer -from .encoder import CTCModel +from .configuration_granite_speech import ( + GraniteSpeechConfig, + GraniteSpeechEncoderConfig, +) from peft import get_peft_model, LoraConfig, TaskType import time +@dataclass +class GraniteSpeechCausalLMOutputWithPast(ModelOutput): + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + attention_mask: Optional[torch.FloatTensor] = None + + +### Projector +class EncoderProjectorQFormer(nn.Module): + def __init__(self, config: GraniteSpeechConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.ds_rate = config.downsample_rate + self.window_size = config.window_size + self.num_queries = self.window_size // self.ds_rate + self.query = nn.Parameter(torch.zeros(1, self.num_queries, config.hidden_size)) + self.query.data.normal_(mean=0.0, std=1.0) + self.qformer = Blip2QFormerModel(config) + self.linear = nn.Linear(config.hidden_size, config.llm_dim) + + def forward(self, x, atts): + batch_size, seq_len, dim = x.size() + nblocks = math.ceil(seq_len / self.window_size) + pad = nblocks * self.window_size - seq_len + x = nn.functional.pad(x, (0, 0, 0, pad), "constant", 0) + x = x.view(batch_size * nblocks, self.window_size, dim) + + query_output = self.qformer( + query_embeds=self.query.data, + encoder_hidden_states=x, + encoder_attention_mask=atts, + return_dict=True, + ) + query_proj = self.linear( + query_output.last_hidden_state.view( + batch_size, nblocks * self.window_size // self.ds_rate, -1 + ) + ) + return query_proj + +### Encoder +class CTCModel(nn.Module): + def __init__(self, config: GraniteSpeechEncoderConfig): + super(CTCModel, self).__init__() + + self.rnn_trL = [nn.Linear(config.input_dim, config.hidden_dim, bias=True)] + for l in range(config.num_layers): + self.rnn_trL.append( + ConformerBlock( + dim=config.hidden_dim, + dim_head=config.dim_head, + heads=config.num_heads, + ff_mult=config.feedforward_mult, + conv_expansion_factor=config.conv_expansion_factor, + conv_kernel_size=config.conv_kernel_size, + context_size=config.context_size, # attention context size + 
attn_dropout=config.dropout, + ff_dropout=config.dropout, + conv_dropout=config.dropout, + ) + ) + self.rnn_tr = nn.Sequential(*self.rnn_trL) + + self.out = nn.Linear(config.hidden_dim, config.output_dim, bias=True) + self.out_mid = nn.Linear(config.output_dim, config.hidden_dim, bias=True) + self.context_size = config.context_size + self.input_dim = config.input_dim + self.num_layers = config.num_layers + self.hidden_dim = config.hidden_dim + self.output_dim = config.output_dim + + def forward(self, x: torch.Tensor): + x = self.rnn_trL[0](x) + for l in range(1, self.num_layers + 1): + x = self.rnn_trL[l](x, self.context_size) + if l == self.num_layers // 2: + x_mid = x.clone() + x_mid = self.out(x_mid) + x += self.out_mid(nn.Softmax(dim=-1)(x_mid)) + return x + + +# NOTE: Conformer code is adapated from the following +# https://github.com/lucidrains/conformer.git +# helper functions + +def calc_same_padding(kernel_size): + pad = kernel_size // 2 + return (pad, pad - (kernel_size + 1) % 2) + + +class Permute(nn.Module): + def __init__(self, dims): + super().__init__() + self.dims = dims + + def forward(self, x): + x = x.permute(self.dims) + return x + + +# helper classes + +class DepthWiseConv1d(nn.Module): + def __init__(self, chan_in, chan_out, kernel_size, padding): + super().__init__() + self.padding = padding + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups = chan_in, bias=False) + + def forward(self, x): + x = F.pad(x, self.padding) + return self.conv(x) + +# attention, feedforward, and conv module + +class Scale(nn.Module): + def __init__(self, scale, fn): + super().__init__() + self.fn = fn + self.scale = scale + + def forward(self, x, **kwargs): + return self.fn(x, **kwargs) * self.scale + +class PreNorm(nn.Module): + def __init__(self, dim, fn): + super().__init__() + self.fn = fn + self.norm = nn.LayerNorm(dim) + + def forward(self, x, **kwargs): + x = self.norm(x) + return self.fn(x, **kwargs) + +class PreNormAttn(nn.Module): + def __init__(self, dim, fn): + super().__init__() + self.fn = fn + self.norm = nn.LayerNorm(dim) + + def forward(self, x, context_size, **kwargs): + x = self.norm(x) + return self.fn(x, context_size, **kwargs) + +class Attention(nn.Module): + def __init__( + self, + dim, + heads = 8, + dim_head = 64, + dropout = 0., + context_size = 200, + max_pos_emb = 512 + ): + super().__init__() + inner_dim = dim_head * heads + self.heads= heads + self.dim_head = dim_head + self.scale = dim_head ** -0.5 + self.to_q = nn.Linear(dim, inner_dim, bias = False) + self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False) + self.to_out = nn.Linear(inner_dim, dim) + + self.max_pos_emb = max_pos_emb + self.rel_pos_emb = nn.Embedding(2 * max_pos_emb + 1, dim_head) + + self.dropout = nn.Dropout(dropout) + + def forward(self, x, context_size): + device, h, max_pos_emb = x.device, self.heads, self.max_pos_emb + bs, n, d = x.shape + assert(context_size > 0 and context_size <= max_pos_emb) + + nb = n // context_size + nr = n % context_size + if nr > 0: + y = torch.zeros(x.shape[0], context_size-nr, x.shape[2], device=device) + x = torch.cat((x,y), dim=1) + nb += 1 + + q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, dim = -1)) + q, k, v = map(lambda t: t.reshape(bs, nb, context_size, h, -1).transpose(2, 3), (q, k, v)) + dots = einsum('b m h i d, b m h j d -> b m h i j', q, k) * self.scale + + # shaw's relative positional embedding + seq = torch.arange(context_size, device = device) + dist = seq.view(-1, 1) - seq.view(1, -1) + dist = torch.clamp(dist,-context_size, 
context_size) + max_pos_emb + rel_pos_emb = self.rel_pos_emb(dist).to(q) + pos_attn = einsum('b m h c d, c r d -> b m h c r', q, rel_pos_emb) * self.scale + dots = dots + pos_attn + + if nr > 0: + mask = torch.ones(context_size, context_size, device=device) + mask[:nr,:nr] = 0 + mask_value = -torch.finfo(dots.dtype).max + dots[:,-1,:].masked_fill_(mask.bool(), mask_value) + + attn = dots.softmax(dim = -1) + + out = einsum('b m h i j, b m h j d -> b m h i d', attn, v) + out = out.transpose(2, 3).reshape(bs, x.shape[1], -1) + out = self.to_out(out[:,:n,:]) + return self.dropout(out) + +class FeedForward(nn.Module): + def __init__( + self, + dim, + mult = 4, + dropout = 0. + ): + super().__init__() + self.net = nn.Sequential( + nn.Linear(dim, dim * mult), + nn.SiLU(), + nn.Dropout(dropout), + nn.Linear(dim * mult, dim), + nn.Dropout(dropout) + ) + + def forward(self, x): + return self.net(x) + +class ConformerConvModule(nn.Module): + def __init__( + self, + dim, + causal = False, + expansion_factor = 2, + kernel_size = 31, + dropout = 0.): + super().__init__() + + inner_dim = dim * expansion_factor + padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0) + + self.net = nn.Sequential( + nn.LayerNorm(dim), + Permute(dims=(0, 2, 1)), + nn.Conv1d(dim, inner_dim * 2, 1), + nn.GLU(dim=1), + DepthWiseConv1d(inner_dim, inner_dim, kernel_size = kernel_size, padding = padding), + nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(), + nn.SiLU(), + nn.Conv1d(inner_dim, dim, 1), + Permute(dims=(0, 2, 1)), + nn.Dropout(dropout) + ) + + def forward(self, x): + return self.net(x) + +# Conformer Block + +class ConformerBlock(nn.Module): + def __init__( + self, + *, + dim, + dim_head = 64, + heads = 8, + ff_mult = 2, + conv_expansion_factor = 2, + conv_kernel_size = 31, + context_size = -1, + attn_dropout = 0., + ff_dropout = 0., + conv_dropout = 0. 
+ ): + super().__init__() + self.ff1 = FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout) + self.attn = Attention(dim = dim, dim_head = dim_head, heads = heads, dropout = attn_dropout, context_size = context_size) + self.conv = ConformerConvModule(dim = dim, causal = False, expansion_factor = conv_expansion_factor, kernel_size = conv_kernel_size, dropout = conv_dropout) + self.ff2 = FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout) + + self.attn = PreNormAttn(dim, self.attn) + self.ff1 = Scale(0.5, PreNorm(dim, self.ff1)) + self.ff2 = Scale(0.5, PreNorm(dim, self.ff2)) + + self.post_norm = nn.LayerNorm(dim) + + def forward(self, x, context_size): + x = self.ff1(x) + x + x = self.attn(x, context_size) + x + x = self.conv(x) + x + x = self.ff2(x) + x + x = self.post_norm(x) + return x + + + class GraniteSpeechForConditionalGeneration(PreTrainedModel, GenerationMixin): + _supports_cache_class = True def __init__(self, config: GraniteSpeechConfig): super().__init__(config) - self.llm = GraniteForCausalLM.from_pretrained(config.llm_name) + self.language_model = GraniteForCausalLM.from_pretrained(config.llm_name) + # TODO - See if we can use lora layers or this can be moved out to a conditional wrapper peft_config = LoraConfig( task_type=TaskType.CAUSAL_LM, inference_mode=True, @@ -26,26 +331,40 @@ def __init__(self, config: GraniteSpeechConfig): lora_alpha=config.lora_alpha, target_modules=config.lora_modules, ) - self.llm = get_peft_model(self.llm, peft_config) + self.language_model = get_peft_model(self.language_model, peft_config) + # TODO - move all of this stuff out self.encoder = CTCModel(config.encoder_config) - self.projector = EncoderProjectorQFormer(config.projector_config) - encoder_state_dict = torch.load( "data/encoder.pt", map_location="cpu", weights_only=True ) - print(self.encoder.load_state_dict(encoder_state_dict, strict=False)) + self.encoder.load_state_dict(encoder_state_dict, strict=False) lora_state_dict = torch.load( "data/lora_adapter.pt", map_location="cpu", weights_only=True ) - self.llm.load_state_dict(lora_state_dict, strict=False) + self.language_model.load_state_dict(lora_state_dict, strict=False) projector_state_dict = torch.load( "data/projector.pt", map_location="cpu", weights_only=True ) self.projector.load_state_dict(projector_state_dict, strict=True) + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def get_audio_features(self, input_features): + # TODO - remove timers, keeping them for now to ensure we don't + # add a ton of extra latency while we are porting the model. 
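+        # The conformer-based CTC encoder maps mel features to hidden states of
+        # size hidden_dim; the Q-Former projector then splits those states into
+        # windows of window_size frames and compresses each window to
+        # window_size // downsample_rate query embeddings of the LLM hidden size.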
+ a = time.time() + encoder_embeds = self.encoder(input_features) + print("Encoder", time.time() - a, "secs") + + a = time.time() + projected_embeds = self.projector(encoder_embeds, None) + return projected_embeds def forward( self, @@ -61,67 +380,90 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ): - if inputs_embeds is None: - inputs_embeds = self.prepare_inputs_for_generation( - input_ids=input_ids, - input_features=input_features, - attention_mask=attention_mask, - ) - llm_outputs = self.llm(inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - past_key_values=past_key_values, - position_ids=position_ids, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - - ) - return llm_outputs - - def generate( - self, - input_ids, - inputs_embeds=None, - input_features=None, - attention_mask=None, - **kwargs, + logits_to_keep: Union[int, torch.Tensor] = 0, ): + # Similar to llava; if we have input IDs, we encode them into embeddings. + # On the first pass, we should have no input embeddings, and only input features, + # since we need to encode the features into the LLM's embedded output. + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if input_features is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + ) + if inputs_embeds is None: - inputs_embeds = self.prepare_inputs_for_generation( + # Get the base embeddings; set all audio tokens to 0 index + # to avoid out of vocabulary issues with the LLM embedding. + # Audio features will be masked into is_audio_idx indices later. 
+ is_audio_idx = input_ids == self.config.audio_token_index + llm_input_ids = input_ids.clone() + llm_input_ids[is_audio_idx] = 0 + inputs_embeds = self.get_input_embeddings()(llm_input_ids) + + if input_features is not None: + # Get the audio features from the encoder / projector + audio_features = self.get_audio_features(input_features) + + # Merge the audio features into the LLM embeddings + inputs_embeds = self.get_merged_audio_embeddings( input_ids=input_ids, - input_features=input_features, - attention_mask=attention_mask, + audio_features=audio_features, ) - model_outputs = self.llm.generate( - inputs_embeds=inputs_embeds, attention_mask=attention_mask, **kwargs + + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + past_key_values=past_key_values, + position_ids=position_ids, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, ) - return model_outputs + + logits = outputs[0] + + return GraniteSpeechCausalLMOutputWithPast( + loss=None, # TODO + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + def prepare_inputs_for_generation( self, input_ids, + past_key_values=None, + inputs_embeds=None, input_features=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, **kwargs, ): - a = time.time() - encoder_embeds = self.encoder(input_features) - print("Encoder", time.time() - a, "secs") - - a = time.time() - projected_embeds = self.projector(encoder_embeds, None) - print("Projector", time.time() - a, "secs") + # Overwritten -- in specific circumstances we don't want to forward audio inputs to the model - a = time.time() - # concatenate embeddings and invoke LLM generate - # tokenizer.vocab[self_processor.audio_token] - combined_embeds = self.get_merged_audio_embeddings( - input_ids=input_ids, - audio_features=projected_embeds, + model_inputs = self.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, ) - return combined_embeds + + # If we're in cached decoding stage, pixel values should be None because input ids + # do not contain special audio token anymore Otherwise we need input feature values + # to be passed to the model + if cache_position[0] == 0: + model_inputs["input_features"] = input_features + return model_inputs def get_merged_audio_embeddings(self, input_ids, audio_features): """ @@ -134,7 +476,7 @@ def get_merged_audio_embeddings(self, input_ids, audio_features): """ is_audio_index = input_ids == self.config.audio_token_index llm_input_ids = torch.where(is_audio_index, 0, input_ids) - inputs_embeds = self.llm.get_input_embeddings()( + inputs_embeds = self.language_model.get_input_embeddings()( llm_input_ids ) # [bsz, # features, hidden size] diff --git a/src/transformers/models/granite_speech/projector.py b/src/transformers/models/granite_speech/projector.py deleted file mode 100644 index 0f56d1a931b5..000000000000 --- a/src/transformers/models/granite_speech/projector.py +++ /dev/null @@ -1,38 +0,0 @@ -import torch -import torch.nn as nn -from .configuration_granite_speech import GraniteSpeechConfig -from transformers import Blip2QFormerModel -import math - -class EncoderProjectorQFormer(nn.Module): - def __init__(self, config: GraniteSpeechConfig): 
- super().__init__() - self.hidden_size = config.hidden_size - self.ds_rate = config.downsample_rate - self.window_size = config.window_size - self.num_queries = self.window_size // self.ds_rate - self.query = nn.Parameter(torch.zeros(1, self.num_queries, config.hidden_size)) - self.query.data.normal_(mean=0.0, std=1.0) - self.qformer = Blip2QFormerModel(config) - self.linear = nn.Linear(config.hidden_size, config.llm_dim) - - def forward(self, x, atts): - batch_size, seq_len, dim = x.size() - nblocks = math.ceil(seq_len / self.window_size) - pad = nblocks * self.window_size - seq_len - x = nn.functional.pad(x, (0, 0, 0, pad), "constant", 0) - x = x.view(batch_size * nblocks, self.window_size, dim) - - query_output = self.qformer( - query_embeds=self.query.data, - encoder_hidden_states=x, - encoder_attention_mask=atts, - return_dict=True, - ) - query_proj = self.linear( - query_output.last_hidden_state.view( - batch_size, nblocks * self.window_size // self.ds_rate, -1 - ) - ) - - return query_proj \ No newline at end of file From a841bc1458cc106db53886206f37fa0b832eea0f Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Sat, 15 Mar 2025 20:48:13 +0000 Subject: [PATCH 003/116] Add loss calc --- .../granite_speech/modeling_granite_speech.py | 35 ++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index e5f8ccacf9ef..df4abd0212d5 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -380,7 +380,9 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, + **lm_kwargs, ): # Similar to llava; if we have input IDs, we encode them into embeddings. # On the first pass, we should have no input embeddings, and only input features, @@ -413,21 +415,46 @@ def forward( ) outputs = self.language_model( - inputs_embeds=inputs_embeds, attention_mask=attention_mask, - past_key_values=past_key_values, position_ids=position_ids, - labels=labels, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + labels=labels, + **lm_kwargs, ) logits = outputs[0] + loss = None + if labels is not None: + # Shift so that tokens < n predict n + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. 
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) + shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() + else: + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device) + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + return GraniteSpeechCausalLMOutputWithPast( - loss=None, # TODO + loss=loss logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, From a943aed7d2ccc872058d6e47278d04d4560ea7fa Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Sun, 16 Mar 2025 05:53:29 +0000 Subject: [PATCH 004/116] Fix config loading Signed-off-by: Alex-Brooks --- .../configuration_granite_speech.py | 35 +++++++++++-------- .../granite_speech/modeling_granite_speech.py | 17 +++++---- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index e3814af4fb22..d86c473439c2 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -1,9 +1,11 @@ from transformers.configuration_utils import PretrainedConfig from transformers.models.auto import AutoConfig from transformers.models.blip_2.configuration_blip_2 import Blip2QFormerConfig - +from transformers.models.granite.configuration_granite import GraniteConfig class GraniteSpeechEncoderConfig(PretrainedConfig): + model_type = "granite_speech_encoder" + def __init__( self, input_dim=160, @@ -17,7 +19,9 @@ def __init__( dropout=0.1, conv_kernel_size=15, conv_expansion_factor=2, + **kwargs, ): + super().__init__(**kwargs) self.input_dim = input_dim self.num_layers = num_layers self.hidden_dim = hidden_dim @@ -46,6 +50,7 @@ def __init__( cross_attention_frequency=1, max_position_embeddings=2048, use_qformer_text_input=False, + **kwargs, ): super().__init__( hidden_size=hidden_size, @@ -56,6 +61,7 @@ def __init__( cross_attention_frequency=cross_attention_frequency, max_position_embeddings=max_position_embeddings, use_qformer_text_input=use_qformer_text_input, + **kwargs, ) self.downsample_rate = downsample_rate @@ -64,7 +70,7 @@ def __init__( class GraniteSpeechConfig(PretrainedConfig): - model_type = "speech_granite" + model_type = "granite_speech" # TODO - Probably should consolidate these into a single config sub_configs = { "llm_config": AutoConfig, @@ -77,29 +83,28 @@ def __init__( encoder_config=None, llm_config=None, projector_config=None, - # TODO - need to figure out how to handle lora here / separation of peft integration with peft - # Keeping it here during the initial porting - lora_r=64, - lora_alpha=32, - lora_modules=["q_proj", "v_proj"], # TODO - we should use a text config here instead of the direct model, then use from_config() llm_name="ibm-granite/granite-3.1-8b-instruct", audio_token_index=49155, **kwargs, ): - if llm_config is None: + # TODO - clean this up + if not isinstance(llm_config, AutoConfig): llm_config = 
AutoConfig.from_pretrained(llm_name) - if encoder_config is None: - encoder_config = GraniteSpeechEncoderConfig() - if projector_config is None: - projector_config = GraniteSpeechProjectorConfig() + + if not isinstance(encoder_config, GraniteSpeechEncoderConfig): + encoder_config = dict() if encoder_config is None else encoder_config + encoder_config = GraniteSpeechEncoderConfig(**encoder_config) + + if not isinstance(projector_config, GraniteSpeechProjectorConfig): + projector_config = dict() if projector_config is None else projector_config + projector_config = GraniteSpeechProjectorConfig(**projector_config) self.encoder_config = encoder_config self.llm_config = llm_config self.projector_config = projector_config - self.lora_r = lora_r - self.lora_alpha = lora_alpha - self.lora_modules = lora_modules self.llm_name = llm_name self.audio_token_index = audio_token_index super().__init__(**kwargs) + +__all__ = ["GraniteSpeechEncoderConfig", "GraniteSpeechProjectorConfig", "GraniteSpeechConfig"] \ No newline at end of file diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index df4abd0212d5..4630df1f47ce 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -7,12 +7,12 @@ from torch import nn, einsum import torch.nn.functional as F -from ...modeling_outputs import BaseModelOutput, ModelOutput +from ...modeling_outputs import ModelOutput -from transformers import Blip2QFormerModel +from transformers import Blip2QFormerModel, GraniteForCausalLM from transformers.generation import GenerationMixin from transformers.modeling_utils import PreTrainedModel -from transformers.models.granite import GraniteForCausalLM +from ..auto import AutoModelForCausalLM from .configuration_granite_speech import ( GraniteSpeechConfig, GraniteSpeechEncoderConfig, @@ -318,18 +318,21 @@ def forward(self, x, context_size): class GraniteSpeechForConditionalGeneration(PreTrainedModel, GenerationMixin): + config_class = GraniteSpeechConfig _supports_cache_class = True + def __init__(self, config: GraniteSpeechConfig): super().__init__(config) + # self.language_model = GraniteForCausalLM.from_config(config.llm_name) self.language_model = GraniteForCausalLM.from_pretrained(config.llm_name) # TODO - See if we can use lora layers or this can be moved out to a conditional wrapper peft_config = LoraConfig( task_type=TaskType.CAUSAL_LM, inference_mode=True, - r=config.lora_r, - lora_alpha=config.lora_alpha, - target_modules=config.lora_modules, + r=64, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], ) self.language_model = get_peft_model(self.language_model, peft_config) @@ -454,7 +457,7 @@ def forward( return (loss,) + output if loss is not None else output return GraniteSpeechCausalLMOutputWithPast( - loss=loss + loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, From d459591cdf9e6c77353bfbf7c4b54da2126e63f6 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Sun, 16 Mar 2025 07:38:19 +0000 Subject: [PATCH 005/116] Split new / old loading logic --- .../configuration_granite_speech.py | 17 ++++----- .../granite_speech/modeling_granite_speech.py | 35 ++++++++++++++++--- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py 
index d86c473439c2..ff6cc9ee8125 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -2,6 +2,7 @@ from transformers.models.auto import AutoConfig from transformers.models.blip_2.configuration_blip_2 import Blip2QFormerConfig from transformers.models.granite.configuration_granite import GraniteConfig +from ..auto import CONFIG_MAPPING, AutoConfig class GraniteSpeechEncoderConfig(PretrainedConfig): model_type = "granite_speech_encoder" @@ -83,14 +84,15 @@ def __init__( encoder_config=None, llm_config=None, projector_config=None, - # TODO - we should use a text config here instead of the direct model, then use from_config() - llm_name="ibm-granite/granite-3.1-8b-instruct", audio_token_index=49155, + tie_word_embeddings=True, **kwargs, ): - # TODO - clean this up - if not isinstance(llm_config, AutoConfig): - llm_config = AutoConfig.from_pretrained(llm_name) + if isinstance(llm_config, dict): + llm_config["model_type"] = llm_config["model_type"] if "model_type" in llm_config else "granite" + llm_config = CONFIG_MAPPING[llm_config["model_type"]](**llm_config) + elif llm_config is None: + llm_config = CONFIG_MAPPING["granite"]() if not isinstance(encoder_config, GraniteSpeechEncoderConfig): encoder_config = dict() if encoder_config is None else encoder_config @@ -100,11 +102,10 @@ def __init__( projector_config = dict() if projector_config is None else projector_config projector_config = GraniteSpeechProjectorConfig(**projector_config) - self.encoder_config = encoder_config self.llm_config = llm_config + self.encoder_config = encoder_config self.projector_config = projector_config - self.llm_name = llm_name self.audio_token_index = audio_token_index - super().__init__(**kwargs) + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) __all__ = ["GraniteSpeechEncoderConfig", "GraniteSpeechProjectorConfig", "GraniteSpeechConfig"] \ No newline at end of file diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 4630df1f47ce..30df51f18b17 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -322,11 +322,38 @@ class GraniteSpeechForConditionalGeneration(PreTrainedModel, GenerationMixin): _supports_cache_class = True def __init__(self, config: GraniteSpeechConfig): + is_legacy=False + if is_legacy: + self._legacy_load(config) + else: + self._transformers_load(config) + + + def _transformers_load(self, config: GraniteSpeechConfig): + super().__init__(config) + self.language_model = AutoModelForCausalLM.from_config(config.llm_config) + + if self.language_model._tied_weights_keys is not None: + # Need to fix uninitialized lm head issues + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + + # TODO - Add this to the exported config and a conditional generation wrapper for peft. 
+ peft_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + inference_mode=True, + r=64, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + ) + self.language_model = get_peft_model(self.language_model, peft_config) + self.encoder = CTCModel(config.encoder_config) + self.projector = EncoderProjectorQFormer(config.projector_config) + self.post_init() + + def _legacy_load(self, config: GraniteSpeechConfig): super().__init__(config) + self.language_model = GraniteForCausalLM.from_pretrained("ibm-granite/granite-3.1-8b-instruct") - # self.language_model = GraniteForCausalLM.from_config(config.llm_name) - self.language_model = GraniteForCausalLM.from_pretrained(config.llm_name) - # TODO - See if we can use lora layers or this can be moved out to a conditional wrapper peft_config = LoraConfig( task_type=TaskType.CAUSAL_LM, inference_mode=True, @@ -336,7 +363,6 @@ def __init__(self, config: GraniteSpeechConfig): ) self.language_model = get_peft_model(self.language_model, peft_config) - # TODO - move all of this stuff out self.encoder = CTCModel(config.encoder_config) self.projector = EncoderProjectorQFormer(config.projector_config) encoder_state_dict = torch.load( @@ -431,7 +457,6 @@ def forward( labels=labels, **lm_kwargs, ) - logits = outputs[0] loss = None From edcffd4d00c8822b72b739da252ef0d6292f0857 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Sun, 16 Mar 2025 23:22:46 +0000 Subject: [PATCH 006/116] Use transformers integration for loading peft adapters --- .../granite_speech/modeling_granite_speech.py | 60 +++++++------------ 1 file changed, 23 insertions(+), 37 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 30df51f18b17..9af97f1c80e8 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -321,48 +321,48 @@ class GraniteSpeechForConditionalGeneration(PreTrainedModel, GenerationMixin): config_class = GraniteSpeechConfig _supports_cache_class = True - def __init__(self, config: GraniteSpeechConfig): - is_legacy=False + def __init__(self, config: GraniteSpeechConfig, is_legacy=False, skip_lora=True): if is_legacy: - self._legacy_load(config) + self._legacy_load(config, skip_lora) else: self._transformers_load(config) - def _transformers_load(self, config: GraniteSpeechConfig): super().__init__(config) + # NOTE: It doesn't matter when we initialize from config, but we should be careful + # to make sure this does not pick up the adapter_config if in the future we use + # from_pretrained or something similar, since that should be set by the composite + # model; don't need to consider it twice self.language_model = AutoModelForCausalLM.from_config(config.llm_config) if self.language_model._tied_weights_keys is not None: - # Need to fix uninitialized lm head issues + # TODO - fix uninitialized lm head issues self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] - # TODO - Add this to the exported config and a conditional generation wrapper for peft. 
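Patch 006 drops the hand-built `LoraConfig` below from `_transformers_load` in favor of the transformers PEFT integration named in the commit subject. A minimal sketch of that integration path, not part of the patch, assuming the LoRA weights are exported as a standard PEFT adapter; both paths are hypothetical placeholders:

    # Sketch only, under the assumption that an exported PEFT adapter
    # (adapter_config.json + adapter weights) ships alongside the checkpoint.
    from transformers import GraniteSpeechForConditionalGeneration

    model = GraniteSpeechForConditionalGeneration.from_pretrained("path/to/granite-speech")
    model.load_adapter("path/to/granite-speech-lora")  # PeftAdapterMixin path; no manual LoraConfig
    model.enable_adapters()   # audio-conditioned generation
    model.disable_adapters()  # text-only generation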
- peft_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, - inference_mode=True, - r=64, - lora_alpha=32, - target_modules=["q_proj", "v_proj"], - ) - self.language_model = get_peft_model(self.language_model, peft_config) self.encoder = CTCModel(config.encoder_config) self.projector = EncoderProjectorQFormer(config.projector_config) self.post_init() - def _legacy_load(self, config: GraniteSpeechConfig): + def _legacy_load(self, config: GraniteSpeechConfig, skip_lora=False): super().__init__(config) self.language_model = GraniteForCausalLM.from_pretrained("ibm-granite/granite-3.1-8b-instruct") - peft_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, - inference_mode=True, - r=64, - lora_alpha=32, - target_modules=["q_proj", "v_proj"], - ) - self.language_model = get_peft_model(self.language_model, peft_config) + if not skip_lora: + peft_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + inference_mode=True, + r=64, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + ) + self.language_model = get_peft_model(self.language_model, peft_config) + lora_state_dict = torch.load( + "data/lora_adapter.pt", map_location="cpu", weights_only=True + ) + self.language_model.load_state_dict(lora_state_dict, strict=False) + else: + print("Did not load lora adapters!") self.encoder = CTCModel(config.encoder_config) self.projector = EncoderProjectorQFormer(config.projector_config) encoder_state_dict = torch.load( @@ -370,11 +370,6 @@ def _legacy_load(self, config: GraniteSpeechConfig): ) self.encoder.load_state_dict(encoder_state_dict, strict=False) - lora_state_dict = torch.load( - "data/lora_adapter.pt", map_location="cpu", weights_only=True - ) - self.language_model.load_state_dict(lora_state_dict, strict=False) - projector_state_dict = torch.load( "data/projector.pt", map_location="cpu", weights_only=True ) @@ -385,13 +380,7 @@ def get_input_embeddings(self): return self.language_model.get_input_embeddings() def get_audio_features(self, input_features): - # TODO - remove timers, keeping them for now to ensure we don't - # add a ton of extra latency while we are porting the model. - a = time.time() encoder_embeds = self.encoder(input_features) - print("Encoder", time.time() - a, "secs") - - a = time.time() projected_embeds = self.projector(encoder_embeds, None) return projected_embeds @@ -413,9 +402,6 @@ def forward( logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, ): - # Similar to llava; if we have input IDs, we encode them into embeddings. - # On the first pass, we should have no input embeddings, and only input features, - # since we need to encode the features into the LLM's embedded output. 
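The comments removed here described the llava-style flow: on the first pass the audio features are encoded and projected, then merged into the text embeddings at the positions of the special audio token. A minimal sketch of such a merge, not part of the patch and only an assumption, since the body of `get_merged_audio_embeddings` does not appear in these hunks:

    # Assumed llava-style merge; the real helper is not shown in this series' hunks.
    import torch

    def merge_audio_embeddings(input_ids, inputs_embeds, audio_features, audio_token_index):
        # True at positions holding the special audio token, broadcast over the hidden dim
        audio_mask = (input_ids == audio_token_index).unsqueeze(-1).expand_as(inputs_embeds)
        # scatter the projected audio features into those positions
        return inputs_embeds.masked_scatter(audio_mask, audio_features.to(inputs_embeds.dtype))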
if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") From 1f1ec318c7491e5086b73cd9c948048dea775d24 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Sun, 16 Mar 2025 23:32:31 +0000 Subject: [PATCH 007/116] Add generation wrapper for selective lora enablement --- .../models/granite_speech/modeling_granite_speech.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 9af97f1c80e8..7f2f1fb80acf 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -530,4 +530,16 @@ def get_merged_audio_embeddings(self, input_ids, audio_features): ) return inputs_embeds + def generate(self, input_features=None, **kwargs): + """This model is expected to have a lora adapater, which is only + enabled when considering audio inputs. As such, we override generate + to conditionally enable / disable the lora adapter based on whether + or not any input features were provided. + """ + if input_features is not None: + self.enable_adapters() + else: + self.disable_adapters() + return super().generate(input_features=input_features, **kwargs) + __all__ = ["GraniteSpeechForConditionalGeneration"] From b86d169b3a651ebec454e0c44b8f934c275b8a82 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 02:15:57 +0000 Subject: [PATCH 008/116] Add note for qformer encoder automodel --- .../configuration_granite_speech.py | 13 +++++---- .../granite_speech/modeling_granite_speech.py | 27 ++++++++++++------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index ff6cc9ee8125..1c5a61f53125 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -1,7 +1,6 @@ from transformers.configuration_utils import PretrainedConfig from transformers.models.auto import AutoConfig from transformers.models.blip_2.configuration_blip_2 import Blip2QFormerConfig -from transformers.models.granite.configuration_granite import GraniteConfig from ..auto import CONFIG_MAPPING, AutoConfig class GraniteSpeechEncoderConfig(PretrainedConfig): @@ -94,14 +93,18 @@ def __init__( elif llm_config is None: llm_config = CONFIG_MAPPING["granite"]() + if isinstance(projector_config, dict): + # TODO - Make this generic after blip2qformer is moved out to its own model dir. 
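The surrounding `__init__` promotes plain dicts into typed sub-configs (`CONFIG_MAPPING` for the LLM, `Blip2QFormerConfig` for the projector, `GraniteSpeechEncoderConfig` for the encoder), which is what lets a serialized config round-trip. A rough sketch of that round trip, not part of the patch, using only default values:

    # Sketch of the dict-to-sub-config promotion implemented in this __init__.
    from transformers import GraniteSpeechConfig

    cfg = GraniteSpeechConfig()          # granite LLM + blip_2_qformer projector defaults
    as_dict = cfg.to_dict()              # sub-configs are serialized to plain dicts
    reloaded = GraniteSpeechConfig(**as_dict)
    assert reloaded.projector_config.model_type == "blip_2_qformer"
    assert reloaded.encoder_config.hidden_dim == cfg.encoder_config.hidden_dim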
+ if projector_config["model_type"] != "blip_2_qformer": + raise ValueError("Granite speech currently requires blip2 qformer as its encoder!") + projector_config = Blip2QFormerConfig(**projector_config) + elif projector_config is None: + projector_config = Blip2QFormerConfig() + if not isinstance(encoder_config, GraniteSpeechEncoderConfig): encoder_config = dict() if encoder_config is None else encoder_config encoder_config = GraniteSpeechEncoderConfig(**encoder_config) - if not isinstance(projector_config, GraniteSpeechProjectorConfig): - projector_config = dict() if projector_config is None else projector_config - projector_config = GraniteSpeechProjectorConfig(**projector_config) - self.llm_config = llm_config self.encoder_config = encoder_config self.projector_config = projector_config diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 7f2f1fb80acf..f72bf1e593af 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -9,7 +9,7 @@ from ...modeling_outputs import ModelOutput -from transformers import Blip2QFormerModel, GraniteForCausalLM +from transformers import Blip2QFormerModel from transformers.generation import GenerationMixin from transformers.modeling_utils import PreTrainedModel from ..auto import AutoModelForCausalLM @@ -18,9 +18,6 @@ GraniteSpeechEncoderConfig, ) -from peft import get_peft_model, LoraConfig, TaskType -import time - @dataclass class GraniteSpeechCausalLMOutputWithPast(ModelOutput): @@ -42,6 +39,10 @@ def __init__(self, config: GraniteSpeechConfig): self.num_queries = self.window_size // self.ds_rate self.query = nn.Parameter(torch.zeros(1, self.num_queries, config.hidden_size)) self.query.data.normal_(mean=0.0, std=1.0) + # NOTE: It would be better to create this from config, similar to the LLM. + # To do this, we need to register the QFormer model into an automodel, which + # will require pulling it out into its own dir so that it's accessible under + # transformers.models.X self.qformer = Blip2QFormerModel(config) self.linear = nn.Linear(config.hidden_size, config.llm_dim) @@ -109,8 +110,6 @@ def forward(self, x: torch.Tensor): # NOTE: Conformer code is adapated from the following # https://github.com/lucidrains/conformer.git -# helper functions - def calc_same_padding(kernel_size): pad = kernel_size // 2 return (pad, pad - (kernel_size + 1) % 2) @@ -344,7 +343,15 @@ def _transformers_load(self, config: GraniteSpeechConfig): self.post_init() def _legacy_load(self, config: GraniteSpeechConfig, skip_lora=False): + """NOTE: This should only be used for testing the model and converting; + we should use the other loading logic, which does NOT explicitly create + an encapsulated peft model, and instead handles it through the peft mixin + if we have an adapter config present. 
+ """ super().__init__(config) + from peft import get_peft_model, LoraConfig, TaskType + from transformers import GraniteForCausalLM + self.language_model = GraniteForCausalLM.from_pretrained("ibm-granite/granite-3.1-8b-instruct") if not skip_lora: @@ -407,7 +414,7 @@ def forward( if input_features is not None and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both input_features and inputs_embeds at the same time, and must specify either one" ) if inputs_embeds is None: @@ -499,9 +506,9 @@ def prepare_inputs_for_generation( **kwargs, ) - # If we're in cached decoding stage, pixel values should be None because input ids - # do not contain special audio token anymore Otherwise we need input feature values - # to be passed to the model + # If we're in cached decoding stage, input_features should be None because + # input ids do not contain special audio token anymore Otherwise we need + # input feature values to be passed to the model if cache_position[0] == 0: model_inputs["input_features"] = input_features return model_inputs From 7ad8b24b4d4b07dfb06f76c08d9b4652ca68e298 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 02:36:22 +0000 Subject: [PATCH 009/116] Guard torch/audio imports in feature extractor --- .../feature_extraction_granite_speech.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py index f173c97f2d34..1bcdf4f50cdc 100644 --- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -17,23 +17,23 @@ """ from typing import Optional import math -import torch -import torchaudio # TODO - this needs to be handled as an optional dependency from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin -from transformers.utils import logging - +from transformers.utils import logging, is_torch_available, is_torchaudio_available logger = logging.get_logger(__name__) -# TODO - should this be a SequenceFeatureExtractor? 
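This patch only guards the imports; the extractor body is not part of the hunk. As a rough illustration, not taken from the patch, the constructor arguments that follow are the usual knobs of a torchaudio mel-spectrogram front end (`n_mels=80` is an assumption):

    # Illustration only: how sampling_rate / n_fft / win_length / hop_length are
    # conventionally wired into torchaudio. Not the extractor's actual code.
    import torch
    import torchaudio

    melspec = torchaudio.transforms.MelSpectrogram(
        sample_rate=16000, n_fft=512, win_length=400, hop_length=160, n_mels=80
    )
    features = melspec(torch.rand(1, 16000) - 0.5)  # -> (channel, n_mels, frames)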
+if is_torch_available(): + import torch + +if is_torchaudio_available(): + import torchaudio + class GraniteSpeechFeatureExtractor(FeatureExtractionMixin): model_input_names = ["input_features"] def __init__( self, - feature_size=0, sampling_rate=16000, - padding_value=0, n_fft=512, win_length=400, hop_length=160, From 07afe8b1423911f2f18cdf0f655dbea6a990b9d0 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 03:57:57 +0000 Subject: [PATCH 010/116] Handle granite speech autoclasses --- src/transformers/models/auto/configuration_auto.py | 2 ++ src/transformers/models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/auto/processing_auto.py | 1 + 4 files changed, 5 insertions(+) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 43c0b3498d68..6d5677e833dd 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -145,6 +145,7 @@ ("granitemoe", "GraniteMoeConfig"), ("granitemoeshared", "GraniteMoeSharedConfig"), ("granitevision", "LlavaNextConfig"), + ("granite_speech", "GraniteSpeechConfig"), ("graphormer", "GraphormerConfig"), ("grounding-dino", "GroundingDinoConfig"), ("groupvit", "GroupViTConfig"), @@ -494,6 +495,7 @@ ("granitemoe", "GraniteMoeMoe"), ("granitemoeshared", "GraniteMoeSharedMoe"), ("granitevision", "LLaVA-NeXT"), + ("granite_speech", "GraniteSpeech"), ("graphormer", "Graphormer"), ("grounding-dino", "Grounding DINO"), ("groupvit", "GroupViT"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 2067d1797f2c..86dc8703c426 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -61,6 +61,7 @@ ("encodec", "EncodecFeatureExtractor"), ("flava", "FlavaFeatureExtractor"), ("glpn", "GLPNFeatureExtractor"), + ("granite_speech", "GraniteSpeechFeatureExtractor"), ("groupvit", "CLIPFeatureExtractor"), ("hubert", "Wav2Vec2FeatureExtractor"), ("imagegpt", "ImageGPTFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 8c51d6576ef6..1c6828a48396 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -997,6 +997,7 @@ MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict( [ + ("granite_speech", "GraniteSpeechForConditionalGeneration"), ("moonshine", "MoonshineForConditionalGeneration"), ("pop2piano", "Pop2PianoForConditionalGeneration"), ("seamless_m4t", "SeamlessM4TForSpeechToText"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 4cda1ebd19d6..864634965afe 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -66,6 +66,7 @@ ("gemma3", "Gemma3Processor"), ("git", "GitProcessor"), ("got_ocr2", "GotOcr2Processor"), + ("granite_speech", "GraniteSpeechProcessor"), ("grounding-dino", "GroundingDinoProcessor"), ("groupvit", "CLIPProcessor"), ("hubert", "Wav2Vec2Processor"), From 1814333ae372f65e22e212b735856aaff1e72d19 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 04:07:40 +0000 Subject: [PATCH 011/116] Handle optional deps in package structure for granite speech --- .../models/granite_speech/__init__.py | 67 +++++++++++++++++-- 1 file 
changed, 60 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/granite_speech/__init__.py b/src/transformers/models/granite_speech/__init__.py index 41b94195c901..4ccd255e9c62 100644 --- a/src/transformers/models/granite_speech/__init__.py +++ b/src/transformers/models/granite_speech/__init__.py @@ -13,16 +13,69 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import _LazyModule +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, + is_torchaudio_available, +) from ...utils.import_utils import define_import_structure +_import_structure = { + "configuration_granite_speech": [ + "GraniteSpeechConfig", + "GraniteSpeechEncoderConfig", + "GraniteSpeechProjectorConfig", + ], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_granite_speech"] = [ + "GraniteSpeechForConditionalGeneration", + ] + +try: + if not is_torchaudio_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_granite_speech"] = ["GraniteSpeechFeatureExtractor"] + _import_structure["processing_granite_speech"] = ["GraniteSpeechProcessor"] + + + if TYPE_CHECKING: - # TODO - handle optional dependencies - from .configuration_granite_speech import * - from .feature_extraction_granite_speech import * - from .modeling_granite_speech import * - from .processing_granite_speech import * + from .configuration_granite_speech import ( + GraniteSpeechConfig, + GraniteSpeechEncoderConfig, + GraniteSpeechProjectorConfig, + ) + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_granite_speech import ( + GraniteSpeechForConditionalGeneration, + ) + + try: + if not is_torchaudio_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_granite_speech import GraniteSpeechFeatureExtractor + from .processing_granite_speech import GraniteSpeechProcessor else: import sys _file = globals()["__file__"] - sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) + sys.modules[__name__] = _LazyModule(__name__, _file, _import_structure, module_spec=__spec__) From e98325275881fa452f13ec908e7ad6a9aaddcf59 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 04:20:18 +0000 Subject: [PATCH 012/116] Add granite pretrained model def for init --- .../models/granite_speech/__init__.py | 1 + .../configuration_granite_speech.py | 3 ++- .../granite_speech/modeling_granite_speech.py | 16 ++++++++++++++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/granite_speech/__init__.py b/src/transformers/models/granite_speech/__init__.py index 4ccd255e9c62..11c4d41252ad 100644 --- a/src/transformers/models/granite_speech/__init__.py +++ b/src/transformers/models/granite_speech/__init__.py @@ -65,6 +65,7 @@ else: from .modeling_granite_speech import ( GraniteSpeechForConditionalGeneration, + GraniteSpeechPretrainedModel, ) try: diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 1c5a61f53125..b071e53ca01a 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ 
b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -71,7 +71,6 @@ def __init__( class GraniteSpeechConfig(PretrainedConfig): model_type = "granite_speech" - # TODO - Probably should consolidate these into a single config sub_configs = { "llm_config": AutoConfig, "encoder_config": GraniteSpeechEncoderConfig, @@ -85,6 +84,7 @@ def __init__( projector_config=None, audio_token_index=49155, tie_word_embeddings=True, + initializer_range=0.02, **kwargs, ): if isinstance(llm_config, dict): @@ -109,6 +109,7 @@ def __init__( self.encoder_config = encoder_config self.projector_config = projector_config self.audio_token_index = audio_token_index + self.initializer_range = initializer_range super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) __all__ = ["GraniteSpeechEncoderConfig", "GraniteSpeechProjectorConfig", "GraniteSpeechConfig"] \ No newline at end of file diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index f72bf1e593af..a22431b90899 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -315,11 +315,23 @@ def forward(self, x, context_size): return x - -class GraniteSpeechForConditionalGeneration(PreTrainedModel, GenerationMixin): +class GraniteSpeechPretrainedModel(PreTrainedModel): config_class = GraniteSpeechConfig _supports_cache_class = True + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv1d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +class GraniteSpeechForConditionalGeneration(GraniteSpeechPretrainedModel, GenerationMixin): def __init__(self, config: GraniteSpeechConfig, is_legacy=False, skip_lora=True): if is_legacy: self._legacy_load(config, skip_lora) From 7b202eb94448a60b8df3085483496ea45ed74697 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 04:23:56 +0000 Subject: [PATCH 013/116] Add dummy objects for torch/torchaudio --- src/transformers/utils/dummy_torchaudio_objects.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/transformers/utils/dummy_torchaudio_objects.py b/src/transformers/utils/dummy_torchaudio_objects.py index 58b01f06a8ab..73cec412119e 100644 --- a/src/transformers/utils/dummy_torchaudio_objects.py +++ b/src/transformers/utils/dummy_torchaudio_objects.py @@ -14,3 +14,17 @@ class MusicgenMelodyProcessor(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, ["torchaudio"]) + + +class GraniteSpeechFeatureExtractor(metaclass=DummyObject): + _backends = ["torchaudio"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torchaudio"]) + + +class GraniteSpeechProcessor(metaclass=DummyObject): + _backends = ["torchaudio"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torchaudio"]) From 68441487d14e08c26b7376c98f59f2dfd707a0a0 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 05:32:09 +0000 Subject: [PATCH 014/116] Add tests for granite speech processor --- .../test_processor_granite_speech.py | 183 ++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 
tests/models/granite_speech/test_processor_granite_speech.py diff --git a/tests/models/granite_speech/test_processor_granite_speech.py b/tests/models/granite_speech/test_processor_granite_speech.py new file mode 100644 index 000000000000..5e8f507ed1a5 --- /dev/null +++ b/tests/models/granite_speech/test_processor_granite_speech.py @@ -0,0 +1,183 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import json +import tempfile +import unittest +import shutil + +import torch +from transformers import AutoTokenizer, GPT2TokenizerFast + +from transformers.testing_utils import ( + require_torch, + require_torchaudio, + require_torch_gpu, +) +from transformers.utils import is_torchaudio_available + + +if is_torchaudio_available(): + from transformers import GraniteSpeechProcessor, GraniteSpeechFeatureExtractor + +pytest.skip("Public models not yet available", allow_module_level=True) +@require_torch +@require_torchaudio +class GraniteSpeechProcessorTest(unittest.TestCase): + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + # TODO - use the actual model path on HF hub after release. + self.checkpoint = "ibm-granite/granite-speech" + processor = GraniteSpeechProcessor.from_pretrained(self.checkpoint) + processor.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs): + return AutoTokenizer.from_pretrained(self.checkpoint, **kwargs) + + def get_feature_extractor(self, **kwargs): + return GraniteSpeechFeatureExtractor.from_pretrained(self.checkpoint, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_save_load_pretrained_default(self): + """Ensure we can save / reload a processor correctly.""" + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + + processor.save_pretrained(self.tmpdirname) + processor = GraniteSpeechProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) + self.assertIsInstance(processor.tokenizer, GPT2TokenizerFast) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) + self.assertIsInstance(processor.feature_extractor, GraniteSpeechFeatureExtractor) + + def test_requires_audio_or_text(self): + """Ensure we require at audio, text, or both.""" + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + + with pytest.raises(ValueError): + processor(text=None, audios=None) + + def test_bad_text_fails(self): + """Ensure we gracefully fail if text is the wrong type.""" + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + + processor = GraniteSpeechProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + with pytest.raises(TypeError): + 
processor(text=424, audios=None) + + def test_bad_nested_text_fails(self): + """Ensure we gracefully fail if text is the wrong nested type.""" + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + + with pytest.raises(TypeError): + processor(text=[424], audios=None) + + def test_bad_audios_fails(self): + """Ensure we gracefully fail if audio is the wrong type.""" + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + + with pytest.raises(TypeError): + processor(text=None, audios="foo") + + def test_bad_audios_fails(self): + """Ensure we gracefully fail if audio is the wrong nested type.""" + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + + with pytest.raises(TypeError): + processor(text=None, audios=["foo"]) + + def test_audio_token_filling(self): + """Ensure correctly handle audio token filling; this is similar to + the way that llava model preprocesses its image tokens, and depends + on the input sequences feature length. + """ + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + + vec_dims = [1, 269920] + wav = torch.rand(vec_dims) - .5 + + inputs = processor( + text=f"{processor.audio_token} Can you transcribe this audio?", + audios=wav, + return_tensors="pt" + ) + + # Check the number of audio tokens + audio_token_id = tokenizer.get_vocab()[processor.audio_token] + + # Make sure the number of audio tokens matches the number of features + num_expected_features = processor.feature_extractor._get_num_audio_features( + inputs["input_features"], + ) + num_audio_tokens = int(torch.sum(inputs["input_ids"] == audio_token_id)) + assert num_expected_features == num_audio_tokens + + + @require_torch_gpu + def test_device_override(self): + """Ensure that we regardless of the processing device, the tensors + produced are on the CPU. 
+ """ + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + + vec_dims = [1, 269920] + wav = torch.rand(vec_dims) - .5 + + inputs = processor( + text=f"{processor.audio_token} Can you transcribe this audio?", + audios=wav, + return_tensors="pt", + device="cuda", + ) + + assert inputs["input_features"].device.type == "cpu" From 265dfb3fe4c272b40593f4eeb2a57dfac2ede21c Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 05:55:45 +0000 Subject: [PATCH 015/116] Minor formatting fixes and refactoring --- .../granite_speech/modeling_granite_speech.py | 99 +++++++++++-------- 1 file changed, 60 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index a22431b90899..0b83ca24a272 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -108,13 +108,7 @@ def forward(self, x: torch.Tensor): return x -# NOTE: Conformer code is adapated from the following -# https://github.com/lucidrains/conformer.git -def calc_same_padding(kernel_size): - pad = kernel_size // 2 - return (pad, pad - (kernel_size + 1) % 2) - - +# NOTE: Conformer adapated from: https://github.com/lucidrains/conformer.git class Permute(nn.Module): def __init__(self, dims): super().__init__() @@ -125,8 +119,6 @@ def forward(self, x): return x -# helper classes - class DepthWiseConv1d(nn.Module): def __init__(self, chan_in, chan_out, kernel_size, padding): super().__init__() @@ -137,7 +129,6 @@ def forward(self, x): x = F.pad(x, self.padding) return self.conv(x) -# attention, feedforward, and conv module class Scale(nn.Module): def __init__(self, scale, fn): @@ -148,6 +139,7 @@ def __init__(self, scale, fn): def forward(self, x, **kwargs): return self.fn(x, **kwargs) * self.scale + class PreNorm(nn.Module): def __init__(self, dim, fn): super().__init__() @@ -158,6 +150,7 @@ def forward(self, x, **kwargs): x = self.norm(x) return self.fn(x, **kwargs) + class PreNormAttn(nn.Module): def __init__(self, dim, fn): super().__init__() @@ -168,15 +161,16 @@ def forward(self, x, context_size, **kwargs): x = self.norm(x) return self.fn(x, context_size, **kwargs) + class Attention(nn.Module): def __init__( self, dim, - heads = 8, - dim_head = 64, - dropout = 0., - context_size = 200, - max_pos_emb = 512 + heads=8, + dim_head=64, + dropout=0., + context_size=200, + max_pos_emb=512 ): super().__init__() inner_dim = dim_head * heads @@ -205,7 +199,10 @@ def forward(self, x, context_size): nb += 1 q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, dim = -1)) - q, k, v = map(lambda t: t.reshape(bs, nb, context_size, h, -1).transpose(2, 3), (q, k, v)) + q, k, v = map( + lambda t: t.reshape(bs, nb, context_size, h, -1).transpose(2, 3), + (q, k, v), + ) dots = einsum('b m h i d, b m h j d -> b m h i j', q, k) * self.scale # shaw's relative positional embedding @@ -229,12 +226,13 @@ def forward(self, x, context_size): out = self.to_out(out[:,:n,:]) return self.dropout(out) + class FeedForward(nn.Module): def __init__( self, dim, - mult = 4, - dropout = 0. + mult=4, + dropout=0. 
): super().__init__() self.net = nn.Sequential( @@ -248,25 +246,29 @@ def __init__( def forward(self, x): return self.net(x) + class ConformerConvModule(nn.Module): def __init__( self, dim, - causal = False, - expansion_factor = 2, - kernel_size = 31, - dropout = 0.): + causal=False, + expansion_factor=2, + kernel_size=31, + dropout=0.): super().__init__() inner_dim = dim * expansion_factor - padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0) + padding = self.calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0) self.net = nn.Sequential( nn.LayerNorm(dim), Permute(dims=(0, 2, 1)), nn.Conv1d(dim, inner_dim * 2, 1), nn.GLU(dim=1), - DepthWiseConv1d(inner_dim, inner_dim, kernel_size = kernel_size, padding = padding), + DepthWiseConv1d(inner_dim, + inner_dim, + kernel_size=kernel_size, + padding=padding), nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(), nn.SiLU(), nn.Conv1d(inner_dim, dim, 1), @@ -277,28 +279,44 @@ def __init__( def forward(self, x): return self.net(x) -# Conformer Block + @staticmethod + def calc_same_padding(kernel_size: int): + pad = kernel_size // 2 + return (pad, pad - (kernel_size + 1) % 2) + class ConformerBlock(nn.Module): def __init__( self, *, dim, - dim_head = 64, - heads = 8, - ff_mult = 2, - conv_expansion_factor = 2, - conv_kernel_size = 31, - context_size = -1, - attn_dropout = 0., - ff_dropout = 0., - conv_dropout = 0. + dim_head=64, + heads=8, + ff_mult=2, + conv_expansion_factor=2, + conv_kernel_size=31, + context_size=-1, + attn_dropout=0., + ff_dropout=0., + conv_dropout=0. ): super().__init__() - self.ff1 = FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout) - self.attn = Attention(dim = dim, dim_head = dim_head, heads = heads, dropout = attn_dropout, context_size = context_size) - self.conv = ConformerConvModule(dim = dim, causal = False, expansion_factor = conv_expansion_factor, kernel_size = conv_kernel_size, dropout = conv_dropout) - self.ff2 = FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout) + self.ff1 = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) + self.attn = Attention( + dim=dim, + dim_head=dim_head, + heads=heads, + dropout=attn_dropout, + context_size=context_size, + ) + self.conv = ConformerConvModule( + dim=dim, + causal=False, + expansion_factor=conv_expansion_factor, + kernel_size=conv_kernel_size, + dropout=conv_dropout, + ) + self.ff2 = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) self.attn = PreNormAttn(dim, self.attn) self.ff1 = Scale(0.5, PreNorm(dim, self.ff1)) @@ -561,4 +579,7 @@ def generate(self, input_features=None, **kwargs): self.disable_adapters() return super().generate(input_features=input_features, **kwargs) -__all__ = ["GraniteSpeechForConditionalGeneration"] +__all__ = [ + "GraniteSpeechForConditionalGeneration", + "GraniteSpeechPretrainedModel", +] From cff180eff4bbee4cd86569413ed0a73dd5af2940 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 06:04:53 +0000 Subject: [PATCH 016/116] Add options for falling back to config in forward --- .../models/granite_speech/modeling_granite_speech.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 0b83ca24a272..a98c93b226e4 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -426,7 +426,6 @@ def forward( 
input_ids: torch.LongTensor = None, input_features: torch.FloatTensor = None, attention_mask: Optional[torch.Tensor] = None, - feature_attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, @@ -439,6 +438,12 @@ def forward( logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") From 61a9495c96dbc842e764e68cbb8f71a46b4925ea Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 06:20:24 +0000 Subject: [PATCH 017/116] Tentative model docstrings for granite speech --- .../granite_speech/modeling_granite_speech.py | 135 +++++++++++++++++- 1 file changed, 133 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index a98c93b226e4..bfdb79e30118 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -17,16 +17,50 @@ GraniteSpeechConfig, GraniteSpeechEncoderConfig, ) +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) + +logger = logging.get_logger(__name__) +_CONFIG_FOR_DOC = "GraniteSpeechConfig" @dataclass class GraniteSpeechCausalLMOutputWithPast(ModelOutput): + """ + Base class for LlavaNext causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
+ + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ loss: Optional[torch.FloatTensor] = None logits: torch.FloatTensor = None past_key_values: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - attention_mask: Optional[torch.FloatTensor] = None ### Projector @@ -349,6 +383,82 @@ def _init_weights(self, module): module.weight.data[module.padding_idx].zero_() +GRANITE_SPEECH_START_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + input_features (`torch.FloatTensor` of shape `(batch_size, #TODO, #TODO)): + The tensors corresponding to the input images. input features can be obtained using + [`AutoImageProcessor`]. See [`GraniteSpeechFeatureExtractor.__call__`] for details. + [`GraniteSpeechProcessor`] uses [`GraniteSpeechFeatureExtractor`] for processing audio. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + + + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + """The Granite Speech model, which consists of an audio encoder, projector, and language model.""", + GRANITE_SPEECH_START_DOCSTRING, +) class GraniteSpeechForConditionalGeneration(GraniteSpeechPretrainedModel, GenerationMixin): def __init__(self, config: GraniteSpeechConfig, is_legacy=False, skip_lora=True): if is_legacy: @@ -421,6 +531,8 @@ def get_audio_features(self, input_features): projected_embeds = self.projector(encoder_embeds, None) return projected_embeds + @add_start_docstrings_to_model_forward(GRANITE_SPEECH_START_DOCSTRING) + @replace_return_docstrings(output_type=GraniteSpeechCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids: torch.LongTensor = None, @@ -437,7 +549,26 @@ def forward( cache_position: Optional[torch.LongTensor] = None, logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, - ): + ) -> Union[Tuple[torch.Tensor], GraniteSpeechCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + TODO - add example for usage. 
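A minimal sketch of what the missing example could look like, drawn from the processor tests added earlier in this series (patch 014); `ibm-granite/granite-speech` is the placeholder checkpoint name used in those tests and is not yet published:

    # Sketch only; checkpoint name is the placeholder from the processor tests.
    import torch
    from transformers import AutoProcessor, GraniteSpeechForConditionalGeneration

    processor = AutoProcessor.from_pretrained("ibm-granite/granite-speech")
    model = GraniteSpeechForConditionalGeneration.from_pretrained("ibm-granite/granite-speech")

    wav = torch.rand(1, 269920) - 0.5  # mono 16 kHz waveform, as in the tests
    inputs = processor(
        text=f"{processor.audio_token} Can you transcribe this audio?",
        audios=wav,
        return_tensors="pt",
    )
    generated_ids = model.generate(**inputs, max_new_tokens=64)
    print(processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])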
+ """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states From 88633ffe37bc033ee5ff3cedcde03e0bebbbd40d Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 07:58:11 +0000 Subject: [PATCH 018/116] Fix config type --- .../models/granite_speech/configuration_granite_speech.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index b071e53ca01a..1614d3bcadc1 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -97,9 +97,9 @@ def __init__( # TODO - Make this generic after blip2qformer is moved out to its own model dir. if projector_config["model_type"] != "blip_2_qformer": raise ValueError("Granite speech currently requires blip2 qformer as its encoder!") - projector_config = Blip2QFormerConfig(**projector_config) + projector_config = GraniteSpeechProjectorConfig(**projector_config) elif projector_config is None: - projector_config = Blip2QFormerConfig() + projector_config = GraniteSpeechProjectorConfig() if not isinstance(encoder_config, GraniteSpeechEncoderConfig): encoder_config = dict() if encoder_config is None else encoder_config From a04da45ac01c28724afdfc71914bd055f3a76dc8 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 07:58:26 +0000 Subject: [PATCH 019/116] Remove legacy load --- .../granite_speech/modeling_granite_speech.py | 47 ------------------- 1 file changed, 47 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index bfdb79e30118..201a0222fe2a 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -461,12 +461,6 @@ def _init_weights(self, module): ) class GraniteSpeechForConditionalGeneration(GraniteSpeechPretrainedModel, GenerationMixin): def __init__(self, config: GraniteSpeechConfig, is_legacy=False, skip_lora=True): - if is_legacy: - self._legacy_load(config, skip_lora) - else: - self._transformers_load(config) - - def _transformers_load(self, config: GraniteSpeechConfig): super().__init__(config) # NOTE: It doesn't matter when we initialize from config, but we should be careful # to make sure this does not pick up the adapter_config if in the future we use @@ -482,47 +476,6 @@ def _transformers_load(self, config: GraniteSpeechConfig): self.projector = EncoderProjectorQFormer(config.projector_config) self.post_init() - def _legacy_load(self, config: GraniteSpeechConfig, skip_lora=False): - """NOTE: This should only be used for testing the model and converting; - we should use the other loading logic, which does NOT explicitly create - an encapsulated peft model, and instead handles it through the peft mixin - if we have an adapter config present. 
- """ - super().__init__(config) - from peft import get_peft_model, LoraConfig, TaskType - from transformers import GraniteForCausalLM - - self.language_model = GraniteForCausalLM.from_pretrained("ibm-granite/granite-3.1-8b-instruct") - - if not skip_lora: - peft_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, - inference_mode=True, - r=64, - lora_alpha=32, - target_modules=["q_proj", "v_proj"], - ) - self.language_model = get_peft_model(self.language_model, peft_config) - - lora_state_dict = torch.load( - "data/lora_adapter.pt", map_location="cpu", weights_only=True - ) - self.language_model.load_state_dict(lora_state_dict, strict=False) - else: - print("Did not load lora adapters!") - self.encoder = CTCModel(config.encoder_config) - self.projector = EncoderProjectorQFormer(config.projector_config) - encoder_state_dict = torch.load( - "data/encoder.pt", map_location="cpu", weights_only=True - ) - self.encoder.load_state_dict(encoder_state_dict, strict=False) - - projector_state_dict = torch.load( - "data/projector.pt", map_location="cpu", weights_only=True - ) - self.projector.load_state_dict(projector_state_dict, strict=True) - self.post_init() - def get_input_embeddings(self): return self.language_model.get_input_embeddings() From 581bae2c700a35aa4e46b76d663ed6422b95b8d1 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 08:04:58 +0000 Subject: [PATCH 020/116] Allow non-lora variants for granite speech --- .../models/granite_speech/modeling_granite_speech.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 201a0222fe2a..ba207266f8d9 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -20,6 +20,7 @@ from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, + is_peft_available, logging, replace_return_docstrings, ) @@ -662,10 +663,11 @@ def generate(self, input_features=None, **kwargs): to conditionally enable / disable the lora adapter based on whether or not any input features were provided. 
""" - if input_features is not None: - self.enable_adapters() - else: - self.disable_adapters() + if is_peft_available and self._hf_peft_config_loaded: + if input_features is not None: + self.enable_adapters() + else: + self.disable_adapters() return super().generate(input_features=input_features, **kwargs) __all__ = [ From deb77eb8ee4ff5f02f8b59d645081accb052d1ed Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 16:40:27 +0000 Subject: [PATCH 021/116] Override weight tying for llm --- .../models/granite_speech/modeling_granite_speech.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index ba207266f8d9..164020960351 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -469,14 +469,13 @@ def __init__(self, config: GraniteSpeechConfig, is_legacy=False, skip_lora=True) # model; don't need to consider it twice self.language_model = AutoModelForCausalLM.from_config(config.llm_config) - if self.language_model._tied_weights_keys is not None: - # TODO - fix uninitialized lm head issues - self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] - self.encoder = CTCModel(config.encoder_config) self.projector = EncoderProjectorQFormer(config.projector_config) self.post_init() + def tie_weights(self): + return self.language_model.tie_weights() + def get_input_embeddings(self): return self.language_model.get_input_embeddings() From 674e97162ddda27e0b6b75fdaa4d5f59de25001f Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 16:51:07 +0000 Subject: [PATCH 022/116] Use text config instead of llm config --- .../configuration_granite_speech.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 1614d3bcadc1..2ff3cd8dc203 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -72,7 +72,7 @@ def __init__( class GraniteSpeechConfig(PretrainedConfig): model_type = "granite_speech" sub_configs = { - "llm_config": AutoConfig, + "text_config": AutoConfig, "encoder_config": GraniteSpeechEncoderConfig, "projector_config": GraniteSpeechProjectorConfig, } @@ -80,18 +80,18 @@ class GraniteSpeechConfig(PretrainedConfig): def __init__( self, encoder_config=None, - llm_config=None, + text_config=None, projector_config=None, audio_token_index=49155, tie_word_embeddings=True, initializer_range=0.02, **kwargs, ): - if isinstance(llm_config, dict): - llm_config["model_type"] = llm_config["model_type"] if "model_type" in llm_config else "granite" - llm_config = CONFIG_MAPPING[llm_config["model_type"]](**llm_config) - elif llm_config is None: - llm_config = CONFIG_MAPPING["granite"]() + if isinstance(text_config, dict): + text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "granite" + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["granite"]() if isinstance(projector_config, dict): # TODO - Make this generic after blip2qformer is moved out to its own model dir. 
@@ -105,7 +105,7 @@ def __init__( encoder_config = dict() if encoder_config is None else encoder_config encoder_config = GraniteSpeechEncoderConfig(**encoder_config) - self.llm_config = llm_config + self.text_config = text_config self.encoder_config = encoder_config self.projector_config = projector_config self.audio_token_index = audio_token_index From 0cbe18afcca3273f887cde61030bb94d8eb3bcb9 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 17 Mar 2025 16:52:03 +0000 Subject: [PATCH 023/116] Add output embeddings getter to fix weight tying --- .../models/granite_speech/modeling_granite_speech.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 164020960351..5e674220294b 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -467,18 +467,21 @@ def __init__(self, config: GraniteSpeechConfig, is_legacy=False, skip_lora=True) # to make sure this does not pick up the adapter_config if in the future we use # from_pretrained or something similar, since that should be set by the composite # model; don't need to consider it twice - self.language_model = AutoModelForCausalLM.from_config(config.llm_config) + self.language_model = AutoModelForCausalLM.from_config(config.text_config) + + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] self.encoder = CTCModel(config.encoder_config) self.projector = EncoderProjectorQFormer(config.projector_config) self.post_init() - def tie_weights(self): - return self.language_model.tie_weights() - def get_input_embeddings(self): return self.language_model.get_input_embeddings() + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + def get_audio_features(self, input_features): encoder_embeds = self.encoder(input_features) projected_embeds = self.projector(encoder_embeds, None) From cafb696336f6932d8f280c4bb344384f607b1993 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Tue, 18 Mar 2025 08:50:54 +0000 Subject: [PATCH 024/116] Fix relative imports --- .../configuration_granite_speech.py | 2 +- .../granite_speech/modeling_granite_speech.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 2ff3cd8dc203..78245b3bed84 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -1,7 +1,7 @@ from transformers.configuration_utils import PretrainedConfig from transformers.models.auto import AutoConfig from transformers.models.blip_2.configuration_blip_2 import Blip2QFormerConfig -from ..auto import CONFIG_MAPPING, AutoConfig +from transformers.models.auto import CONFIG_MAPPING, AutoConfig class GraniteSpeechEncoderConfig(PretrainedConfig): model_type = "granite_speech_encoder" diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 5e674220294b..3a0ca1c7911b 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ 
b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -7,17 +7,13 @@ from torch import nn, einsum import torch.nn.functional as F -from ...modeling_outputs import ModelOutput from transformers import Blip2QFormerModel from transformers.generation import GenerationMixin from transformers.modeling_utils import PreTrainedModel -from ..auto import AutoModelForCausalLM -from .configuration_granite_speech import ( - GraniteSpeechConfig, - GraniteSpeechEncoderConfig, -) -from ...utils import ( +from transformers.modeling_outputs import ModelOutput +from transformers.models.auto import AutoModelForCausalLM +from transformers.utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, is_peft_available, @@ -25,6 +21,11 @@ replace_return_docstrings, ) +from .configuration_granite_speech import ( + GraniteSpeechConfig, + GraniteSpeechEncoderConfig, +) + logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "GraniteSpeechConfig" From d6f866d4096dd715824406be5dfd0851c1ae0de2 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Wed, 19 Mar 2025 08:01:56 -0400 Subject: [PATCH 025/116] computing the number of audio features, based on the raw audio sequence. --- .../feature_extraction_granite_speech.py | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py index 1bcdf4f50cdc..79c7b18216aa 100644 --- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -15,7 +15,7 @@ """ Feature extractor class for Speech Granite """ -from typing import Optional +from typing import Optional, List import math from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin from transformers.utils import logging, is_torch_available, is_torchaudio_available @@ -91,14 +91,27 @@ def __call__( return x.detach().cpu() return x - def _get_num_audio_features(self, logmel: BatchFeature) -> int: - """Gets the (variable length) variable length number of features - (i.e., projector output) for the sequence being considered. + + def _get_num_audio_features(self, audio_lengths: List[int]) -> List[int]: """ - # todo: (Avihu) maybe it's better to return a list (length of each sample in the batch) - seq_len = logmel.shape[1] - nblocks = math.ceil(seq_len / self.projector_window_size) + Gets the (variable length) variable length number of features + (i.e., projector output) for the sequences being considered. 
+ """ + hop_length = self.melspec_kwargs["hop_length"] + effective_window_size = self.projector_window_size // self.projector_downsample_rate + + projector_lengths = [] + for raw_length in audio_lengths: + # mel sequence length computation + mel_length = raw_length // hop_length + 1 + # encoder frame takes two mel features + encoder_length = mel_length // 2 + nblocks = math.ceil(encoder_length / self.projector_window_size) + # projector output length + projector_length = nblocks * effective_window_size + projector_lengths.append(projector_length) - return nblocks * self.projector_window_size // self.projector_downsample_rate + return projector_lengths + __all__ = ["GraniteSpeechFeatureExtractor"] \ No newline at end of file From ab4cdc2731cd977572dc9a53996d7c12add0f399 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Wed, 19 Mar 2025 08:03:48 -0400 Subject: [PATCH 026/116] collating audio inputs, and keeping the original lengths. --- .../granite_speech/processing_granite_speech.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index f531e352e7cb..6fe2639f8c46 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -105,9 +105,17 @@ def _get_validated_text(self, text: Union[str, list]) -> List[str]: def _get_validated_audios(self, audios): # todo: if this is a list, collate and keep track of audio lengths - if audios is not None and not isinstance(audios, torch.Tensor): - raise TypeError("Invalid audios provided! Audio should be a torch tensor.") - return audios + if isinstance(audios, torch.Tensor): + lengths = [audios.shape[-1]] * audios.shape[0] + return audios, lengths + elif isinstance(audios, list) and isinstance(audios[0], torch.Tensor): + lengths = [audio.shape[-1] for audio in audios] + padding = [max(lengths) - length for length in lengths] + padded = [torch.nn.functional.pad(audio, (0, pad)) for audio, pad in zip(audios, padding)] + audios = torch.cat(padded, dim=0) + return audios, lengths + + raise TypeError("Invalid audio provided. Audio should be a Tensor or a list of Tensors.") __all__ = ["GraniteSpeechProcessor"] \ No newline at end of file From 2f8f57692ab54cbef7b1d86dcdd466614669332e Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Wed, 19 Mar 2025 08:06:14 -0400 Subject: [PATCH 027/116] asserted we have text. otherwise we can't specify the audio special token. 
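For reference, the length arithmetic added in the previous patches boils down to the standalone sketch below. The hop_length of 160 and the window_size=15 / downsample_rate=5 values are assumptions (feature extractor / projector defaults), not something pinned by these patches, but they reproduce the feature counts the processor tests in this series expect (90 and 171 for the two test lengths):

    import math

    def num_audio_features(raw_length, hop_length=160, window_size=15, downsample_rate=5):
        # Mirrors _get_num_audio_features for a single raw audio length.
        mel_length = raw_length // hop_length + 1            # mel frames from the spectrogram
        encoder_length = mel_length // 2                     # the encoder consumes two mel frames per step
        nblocks = math.ceil(encoder_length / window_size)    # number of projector windows
        return nblocks * (window_size // downsample_rate)    # query outputs per window

    # 269920 samples -> 1688 mel frames -> 844 encoder frames -> 57 windows -> 171 features
    print(num_audio_features(269920))  # 171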
--- .../models/granite_speech/processing_granite_speech.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 6fe2639f8c46..9137cffd77e6 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -48,13 +48,12 @@ def __init__( def __call__( self, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - audios: Union[np.ndarray, List[np.ndarray]] = None, + audios: Union[torch.Tensor, List[torch.Tensor]] = None, device: str = "cpu", **kwargs, ) -> BatchFeature: - if text is None and audios is None: - raise ValueError("You have to provide audio or text") + assert text is not None, "You have to provide text" speech_inputs = {} text_inputs = {} From 579433389d6788603b6bcc37479bac6d4404fcd8 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Wed, 19 Mar 2025 08:07:22 -0400 Subject: [PATCH 028/116] assering the number of audio-symbols/audios match correctly. running get validated_audios only when audio is present --- .../granite_speech/processing_granite_speech.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 9137cffd77e6..c11c4a1fba9a 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -59,24 +59,28 @@ def __call__( text_inputs = {} text = self._get_validated_text(text) - audios = self._get_validated_audios(audios) - # TODO: assert that len(audios) == count(audio_token, text) + expected_num_audios = sum(t.count(self.audio_token) for t in text) if audios is not None: + audios, audio_lengths = self._get_validated_audios(audios) + if len(audio_lengths) != expected_num_audios: + raise ValueError("Text/Audio mismatch. The number of audios and audio tokens do not match") + # Calculate Mel features & the number of placeholders we will need speech_inputs["input_features"] = self.feature_extractor( audios, device=device, ) num_audio_features = self.feature_extractor._get_num_audio_features( - speech_inputs["input_features"], + audio_lengths ) # duplicate the audio placeholders to match the feature dims text = self._expand_audio_placeholders(text, num_audio_features) - - if text is not None: - text_inputs = self.tokenizer(text, **kwargs) + else: + assert expected_num_audios == 0, "no audio is provided, expecting no audio tokens." + + text_inputs = self.tokenizer(text, padding=True, **kwargs) return BatchFeature(data={**text_inputs, **speech_inputs}) def _expand_audio_placeholders(self, text: list[str], num_audio_features: int): From 7dddeff6ee3a57890542ec5e41067305484b2c2a Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Wed, 19 Mar 2025 08:08:27 -0400 Subject: [PATCH 029/116] indentation bugfix + supporting different feature lengths when expanding audio. 
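Roughly, the expansion now walks the batch and consumes one computed feature count per audio placeholder instead of assuming every audio has the same length. A minimal standalone sketch of that behaviour (the placeholder / audio token strings below are illustrative, not the model's actual special tokens):

    def expand_audio_placeholders(texts, audio_token, num_audio_features):
        # Replace each audio token with one copy per computed audio feature,
        # consuming the per-sample counts in order across the whole batch.
        expanded = []
        consumed = 0
        for sample in texts:
            while audio_token in sample:
                sample = sample.replace(audio_token, "<placeholder>" * num_audio_features[consumed], 1)
                consumed += 1
            expanded.append(sample.replace("<placeholder>", audio_token))
        return expanded

    print(expand_audio_placeholders(["<audio> one", "<audio> two"], "<audio>", [2, 3]))
    # ['<audio><audio> one', '<audio><audio><audio> two']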
--- .../granite_speech/processing_granite_speech.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index c11c4a1fba9a..0d4c7bacf857 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -83,19 +83,21 @@ def __call__( text_inputs = self.tokenizer(text, padding=True, **kwargs) return BatchFeature(data={**text_inputs, **speech_inputs}) - def _expand_audio_placeholders(self, text: list[str], num_audio_features: int): + def _expand_audio_placeholders(self, text: list[str], num_audio_features: List[int]): """ Expands audio placeholders in the formatted text to match the number of features of the corresponding embeddings; we can use the resulting text to conveniently mask the audio features into the text embeddings. """ prompt_strings = [] + i = 0 for sample in text: while self.audio_token in sample: - # todo: (Avihu): this assumes all audios have the same length. - sample = sample.replace(self.audio_token, "" * num_audio_features, 1) - prompt_strings.append(sample) - prompt_strings = [sample.replace("", self.audio_token) for sample in prompt_strings] + sample = sample.replace(self.audio_token, "" * num_audio_features[i], 1) + i += 1 + prompt_strings.append(sample) + + prompt_strings = [sample.replace("", self.audio_token) for sample in prompt_strings] return prompt_strings ##### Validation From bf662959187517f20d5e71106df0dcd6938e0b72 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Wed, 19 Mar 2025 08:48:55 -0400 Subject: [PATCH 030/116] redundant, done in _get_validated_text --- .../models/granite_speech/processing_granite_speech.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 0d4c7bacf857..ffd05d6f4106 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -47,14 +47,12 @@ def __init__( def __call__( self, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], audios: Union[torch.Tensor, List[torch.Tensor]] = None, device: str = "cpu", **kwargs, ) -> BatchFeature: - assert text is not None, "You have to provide text" - speech_inputs = {} text_inputs = {} From 42b331d605e2d7a2ef29dfe1f724ae28c9ecd7c0 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Wed, 19 Mar 2025 08:52:31 -0400 Subject: [PATCH 031/116] adapting the tests: - we must have text (not either audio or text) - _get_num_audio_features takes a list of raw lengths, provided it insetad. 
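The adapted tests lean on the list-input collation added earlier in the series: variable-length waveforms are right-padded to the longest sample and the raw lengths are kept so per-sample feature counts can be computed from them. A minimal sketch of that path (the helper name here is illustrative):

    import torch

    def collate_audios(audios):
        # Pad a list of (1, num_samples) mono waveforms to the longest one and
        # keep the original lengths for the per-sample feature computation.
        lengths = [audio.shape[-1] for audio in audios]
        padded = [torch.nn.functional.pad(audio, (0, max(lengths) - audio.shape[-1])) for audio in audios]
        return torch.cat(padded, dim=0), lengths

    batch, lengths = collate_audios([torch.rand(1, 142100) - 0.5, torch.rand(1, 269920) - 0.5])
    print(batch.shape, lengths)  # torch.Size([2, 269920]) [142100, 269920]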
--- .../granite_speech/test_processor_granite_speech.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/models/granite_speech/test_processor_granite_speech.py b/tests/models/granite_speech/test_processor_granite_speech.py index 5e8f507ed1a5..a57ec55d3048 100644 --- a/tests/models/granite_speech/test_processor_granite_speech.py +++ b/tests/models/granite_speech/test_processor_granite_speech.py @@ -69,8 +69,8 @@ def test_save_load_pretrained_default(self): self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertIsInstance(processor.feature_extractor, GraniteSpeechFeatureExtractor) - def test_requires_audio_or_text(self): - """Ensure we require at audio, text, or both.""" + def test_requires_text(self): + """Ensure we require text""" tokenizer = self.get_tokenizer() feature_extractor = self.get_feature_extractor() processor = GraniteSpeechProcessor( @@ -78,8 +78,8 @@ def test_requires_audio_or_text(self): feature_extractor=feature_extractor, ) - with pytest.raises(ValueError): - processor(text=None, audios=None) + with pytest.raises(TypeError): + processor(text=None) def test_bad_text_fails(self): """Ensure we gracefully fail if text is the wrong type.""" @@ -152,7 +152,7 @@ def test_audio_token_filling(self): # Make sure the number of audio tokens matches the number of features num_expected_features = processor.feature_extractor._get_num_audio_features( - inputs["input_features"], + vec_dims[1:], ) num_audio_tokens = int(torch.sum(inputs["input_ids"] == audio_token_id)) assert num_expected_features == num_audio_tokens From 1ada05c93c09c1242609370dad78510bb425f2c8 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 20 Mar 2025 04:58:45 +0000 Subject: [PATCH 032/116] Minor cleanup, remove unused import --- .../granite_speech/processing_granite_speech.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index ffd05d6f4106..11a9ce804f1d 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -17,7 +17,6 @@ """ from typing import List, Union -import numpy as np import torch from transformers.feature_extraction_utils import BatchFeature from transformers.processing_utils import ProcessorMixin @@ -88,11 +87,15 @@ def _expand_audio_placeholders(self, text: list[str], num_audio_features: List[i to conveniently mask the audio features into the text embeddings. 
""" prompt_strings = [] - i = 0 + num_replaced = 0 for sample in text: while self.audio_token in sample: - sample = sample.replace(self.audio_token, "" * num_audio_features[i], 1) - i += 1 + sample = sample.replace( + self.audio_token, + "" * num_audio_features[num_replaced], + 1, + ) + num_replaced += 1 prompt_strings.append(sample) prompt_strings = [sample.replace("", self.audio_token) for sample in prompt_strings] From 5a4ece25e608c93c64e3bb8aca5a4297314c34ec Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 20 Mar 2025 04:59:37 +0000 Subject: [PATCH 033/116] Add more tests for batch feature processing --- .../test_processor_granite_speech.py | 60 +++++++++++++++---- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/tests/models/granite_speech/test_processor_granite_speech.py b/tests/models/granite_speech/test_processor_granite_speech.py index a57ec55d3048..1f0db5d0a5c0 100644 --- a/tests/models/granite_speech/test_processor_granite_speech.py +++ b/tests/models/granite_speech/test_processor_granite_speech.py @@ -16,6 +16,7 @@ import tempfile import unittest import shutil +from parameterized import parameterized import torch from transformers import AutoTokenizer, GPT2TokenizerFast @@ -126,10 +127,14 @@ def test_bad_audios_fails(self): with pytest.raises(TypeError): processor(text=None, audios=["foo"]) - def test_audio_token_filling(self): - """Ensure correctly handle audio token filling; this is similar to - the way that llava model preprocesses its image tokens, and depends - on the input sequences feature length. + @parameterized.expand([ + ([1, 269920], [171]), + ([2, 269920], [171, 171]), + ]) + def test_audio_token_filling_same_len_feature_tensors(self, vec_dims, num_expected_features): + """Ensure audio token filling is handled correctly when we have + one or more audio inputs whose features are all the same length + stacked into a tensor. """ tokenizer = self.get_tokenizer() feature_extractor = self.get_feature_extractor() @@ -137,13 +142,44 @@ def test_audio_token_filling(self): tokenizer=tokenizer, feature_extractor=feature_extractor, ) + audios = torch.rand(vec_dims) - .5 - vec_dims = [1, 269920] - wav = torch.rand(vec_dims) - .5 + audio_tokens = processor.audio_token * vec_dims[0] + inputs = processor( + text=f"{audio_tokens} Can you compare this audio?", + audios=audios, + return_tensors="pt" + ) + + # Check the number of audio tokens + audio_token_id = tokenizer.get_vocab()[processor.audio_token] + + # Make sure the number of audio tokens matches the number of features + num_computed_features = processor.feature_extractor._get_num_audio_features( + [vec_dims[1] for _ in range(vec_dims[0])], + ) + num_audio_tokens = int(torch.sum(inputs["input_ids"] == audio_token_id)) + assert sum(num_computed_features) == num_audio_tokens + assert num_expected_features == num_expected_features + def test_audio_token_filling_varying_len_feature_list(self): + """Ensure audio token filling is handled correctly when we have + multiple varying len audio sequences passed as a list. 
+ """ + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + vec_dims = [[1, 142100], [1, 269920]] + num_expected_features = [90, 171] + audios = [torch.rand(dims) - .5 for dims in vec_dims] + + audio_tokens = processor.audio_token * len(vec_dims) inputs = processor( - text=f"{processor.audio_token} Can you transcribe this audio?", - audios=wav, + text=f"{audio_tokens} Can you compare this audio?", + audios=audios, return_tensors="pt" ) @@ -151,12 +187,12 @@ def test_audio_token_filling(self): audio_token_id = tokenizer.get_vocab()[processor.audio_token] # Make sure the number of audio tokens matches the number of features - num_expected_features = processor.feature_extractor._get_num_audio_features( - vec_dims[1:], + num_calculated_features = processor.feature_extractor._get_num_audio_features( + [dims[1] for dims in vec_dims], ) num_audio_tokens = int(torch.sum(inputs["input_ids"] == audio_token_id)) - assert num_expected_features == num_audio_tokens - + assert num_calculated_features == [90, 171] + assert sum(num_expected_features) == num_audio_tokens @require_torch_gpu def test_device_override(self): From cd531671fe3206c88f297ab28bb175c81a67d686 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 20 Mar 2025 19:50:28 +0000 Subject: [PATCH 034/116] Allow setting offset in rel position embeddings --- .../granite_speech/configuration_granite_speech.py | 3 ++- .../models/granite_speech/modeling_granite_speech.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 78245b3bed84..2e139a9dba95 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -19,6 +19,7 @@ def __init__( dropout=0.1, conv_kernel_size=15, conv_expansion_factor=2, + use_max_pos_emb_in_pos_emb_calc=True, **kwargs, ): super().__init__(**kwargs) @@ -30,10 +31,10 @@ def __init__( self.dim_head = dim_head self.output_dim = output_dim self.context_size = context_size - self.dropout = dropout self.conv_kernel_size = conv_kernel_size self.conv_expansion_factor = conv_expansion_factor + self.use_max_pos_emb_in_pos_emb_calc = use_max_pos_emb_in_pos_emb_calc class GraniteSpeechProjectorConfig(Blip2QFormerConfig): diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 3a0ca1c7911b..a13a1613ca4c 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -121,6 +121,7 @@ def __init__(self, config: GraniteSpeechEncoderConfig): attn_dropout=config.dropout, ff_dropout=config.dropout, conv_dropout=config.dropout, + use_max_pos_emb_in_pos_emb_calc=config.use_max_pos_emb_in_pos_emb_calc, ) ) self.rnn_tr = nn.Sequential(*self.rnn_trL) @@ -206,7 +207,8 @@ def __init__( dim_head=64, dropout=0., context_size=200, - max_pos_emb=512 + max_pos_emb=512, + use_max_pos_emb_in_pos_emb_calc=True, ): super().__init__() inner_dim = dim_head * heads @@ -221,6 +223,7 @@ def __init__( self.rel_pos_emb = nn.Embedding(2 * max_pos_emb + 1, dim_head) self.dropout = nn.Dropout(dropout) + self.offset = max_pos_emb if use_max_pos_emb_in_pos_emb_calc else 
context_size def forward(self, x, context_size): device, h, max_pos_emb = x.device, self.heads, self.max_pos_emb @@ -244,7 +247,7 @@ def forward(self, x, context_size): # shaw's relative positional embedding seq = torch.arange(context_size, device = device) dist = seq.view(-1, 1) - seq.view(1, -1) - dist = torch.clamp(dist,-context_size, context_size) + max_pos_emb + dist = torch.clamp(dist,-context_size, context_size) + self.offset rel_pos_emb = self.rel_pos_emb(dist).to(q) pos_attn = einsum('b m h c d, c r d -> b m h c r', q, rel_pos_emb) * self.scale dots = dots + pos_attn @@ -334,7 +337,8 @@ def __init__( context_size=-1, attn_dropout=0., ff_dropout=0., - conv_dropout=0. + conv_dropout=0., + use_max_pos_emb_in_pos_emb_calc=True, ): super().__init__() self.ff1 = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) @@ -344,6 +348,7 @@ def __init__( heads=heads, dropout=attn_dropout, context_size=context_size, + use_max_pos_emb_in_pos_emb_calc=use_max_pos_emb_in_pos_emb_calc, ) self.conv = ConformerConvModule( dim=dim, From 6a0d62c7090117bc46dfbb71561618779b8b1726 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 20 Mar 2025 19:57:00 +0000 Subject: [PATCH 035/116] Add config option for warning if peft is not installed w/ lora --- .../granite_speech/configuration_granite_speech.py | 2 ++ .../models/granite_speech/modeling_granite_speech.py | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 2e139a9dba95..541577b3ba93 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -86,6 +86,7 @@ def __init__( audio_token_index=49155, tie_word_embeddings=True, initializer_range=0.02, + has_lora_adapter=True, **kwargs, ): if isinstance(text_config, dict): @@ -111,6 +112,7 @@ def __init__( self.projector_config = projector_config self.audio_token_index = audio_token_index self.initializer_range = initializer_range + self.has_lora_adapter = has_lora_adapter super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) __all__ = ["GraniteSpeechEncoderConfig", "GraniteSpeechProjectorConfig", "GraniteSpeechConfig"] \ No newline at end of file diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index a13a1613ca4c..359c7aff539e 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -480,6 +480,15 @@ def __init__(self, config: GraniteSpeechConfig, is_legacy=False, skip_lora=True) self.encoder = CTCModel(config.encoder_config) self.projector = EncoderProjectorQFormer(config.projector_config) + + if config.has_lora_adapter and not is_peft_available(): + logger.warning( + "Config indicates that a lora adapter should be present, but " + "peft is not installed; this will cause the model to perform " + "incorrectly when audio inputs are provided. Please install " + "peft and reload the model!" 
+ ) + self.post_init() def get_input_embeddings(self): From ed413070d2d6ca4cd0c8748e3cfa2aa4faefd3a7 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 21 Mar 2025 16:15:26 +0000 Subject: [PATCH 036/116] Port blip2 qformer code into granite speech --- .../granite_speech/modeling_granite_speech.py | 684 +++++++++++++++++- 1 file changed, 679 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 359c7aff539e..22e3e2c45531 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -6,12 +6,12 @@ import torch.utils.checkpoint from torch import nn, einsum import torch.nn.functional as F - - -from transformers import Blip2QFormerModel +GraniteSpeechQFormerConfig = int +from transformers.activations import ACT2FN +from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from transformers.generation import GenerationMixin from transformers.modeling_utils import PreTrainedModel -from transformers.modeling_outputs import ModelOutput +from transformers.modeling_outputs import ModelOutput, BaseModelOutputWithPoolingAndCrossAttentions, BaseModelOutputWithPastAndCrossAttentions from transformers.models.auto import AutoModelForCausalLM from transformers.utils import ( add_start_docstrings, @@ -66,6 +66,680 @@ class GraniteSpeechCausalLMOutputWithPast(ModelOutput): ### Projector +# Currently, we copy the Qformer code directly to avoid depending on Blip2; +# it would be better to create the model from config, similar to the LLM, +# but to do this, we will need to register the QFormer model into an automodel, +# which will should involve pulling it out into its own dir so that it is accessible +# under transformers.models.X. 
+ +# Copied from transformers.models.blip_2.modeling_blip2.Blip2QFormerMultiHeadAttention with Blip2->GraniteSpeech +class GraniteSpeechQFormerMultiHeadAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GraniteSpeechQFormer +class GraniteSpeechQFormerSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + +# Copied from transformers.models.blip_2.modeling_blip2.Blip2QFormerAttention with Blip2->GraniteSpeech +class GraniteSpeechQFormerAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = GraniteSpeechQFormerMultiHeadAttention(config, is_cross_attention) + self.output = GraniteSpeechQFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GraniteSpeechQFormer +class GraniteSpeechQFormerIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, 
config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GraniteSpeechQFormer
+class GraniteSpeechQFormerOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+# Copied from transformers.models.blip_2.modeling_blip2.Blip2QFormerLayer with Blip2->GraniteSpeech
+class GraniteSpeechQFormerLayer(nn.Module):
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = GraniteSpeechQFormerAttention(config)
+
+        self.layer_idx = layer_idx
+
+        if layer_idx % config.cross_attention_frequency == 0:
+            self.crossattention = GraniteSpeechQFormerAttention(config, is_cross_attention=True)
+            self.has_cross_attention = True
+        else:
+            self.has_cross_attention = False
+
+        if config.use_qformer_text_input:
+            self.intermediate = GraniteSpeechQFormerIntermediate(config)
+            self.output = GraniteSpeechQFormerOutput(config)
+
+        self.intermediate_query = GraniteSpeechQFormerIntermediate(config)
+        self.output_query = GraniteSpeechQFormerOutput(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+        query_length=0,
+    ):
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_value=self_attn_past_key_value,
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:-1]
+
+        present_key_value = self_attention_outputs[-1]
+
+        if query_length > 0:
+            query_attention_output = attention_output[:, :query_length, :]
+
+            if self.has_cross_attention:
+                if encoder_hidden_states is None:
+                    raise ValueError("encoder_hidden_states must be given for cross-attention
layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + +# Copied from transformers.models.blip_2.modeling_blip2.Blip2QFormerEncoder with Blip2->GraniteSpeech +class GraniteSpeechQFormerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [GraniteSpeechQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + +# Copied from transformers.models.blip.modeling_blip.Blip2PreTrainedModel with Blip2->GraniteSpeechEncoderProjector +class GraniteSpeechEncoderProjectorPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = GraniteSpeechConfig + base_model_prefix = "qformer" + supports_gradient_checkpointing = True + + _no_split_modules = [ + "GraniteSpeechQFormerMultiHeadAttention", + "T5Block", + "OPTDecoderLayer", + ] + _skip_keys_device_placement = "past_key_values" + _keep_in_fp32_modules = ["query_tokens"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +# Copied from transformers.models.blip.modeling_blip.GraniteSpeechQFormerModel with GraniteSpeech->GraniteSpeech +class GraniteSpeechQFormerModel(GraniteSpeechEncoderProjectorPreTrainedModel): + """ + Querying Transformer (Q-Former), used in GraniteSpeech. + """ + + def __init__(self, config: GraniteSpeechQFormerConfig): + super().__init__(config) + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = GraniteSpeechQFormerEncoder(config) + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int], + device: torch.device, + has_query: bool = False, + ) -> torch.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + device (`torch.device`): + The device of the input to the model. + + Returns: + `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + query_embeds: torch.FloatTensor, + query_length: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = ( + query_length if query_length is not None else query_embeds.shape[1] if query_embeds is not None else 0 + ) + + embedding_output = self.layernorm(query_embeds) + embedding_output = self.dropout(embedding_output) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) +# TODO (alex) - refactor GraniteSpeechQformer to be available under +# transformers.models.X, delete all of the code above, and +# create the model through AutoModel. 
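For readers skimming the Q-Former code above: get_extended_attention_mask turns a 0/1 padding mask into an additive bias applied to the raw attention scores before the softmax. A minimal, self-contained sketch of that transformation (shapes and values are illustrative only, not taken from the model):

import torch

# hypothetical padding mask: batch of 2, sequence length 4; the last position
# of the second sample is padding
attention_mask = torch.tensor([[1., 1., 1., 1.],
                               [1., 1., 1., 0.]])

# broadcastable to [batch_size, num_heads, query_len, key_len]
extended_mask = attention_mask[:, None, None, :]
# 0.0 where we attend, -10000.0 where we do not
extended_mask = (1.0 - extended_mask) * -10000.0

scores = torch.zeros(2, 1, 4, 4)                 # stand-in attention logits
probs = (scores + extended_mask).softmax(dim=-1)
print(probs[1, 0, 0])                            # last key gets ~0 attention weight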
+ class EncoderProjectorQFormer(nn.Module): def __init__(self, config: GraniteSpeechConfig): super().__init__() @@ -79,7 +753,7 @@ def __init__(self, config: GraniteSpeechConfig): # To do this, we need to register the QFormer model into an automodel, which # will require pulling it out into its own dir so that it's accessible under # transformers.models.X - self.qformer = Blip2QFormerModel(config) + self.qformer = GraniteSpeechQFormerModel(config) self.linear = nn.Linear(config.hidden_size, config.llm_dim) def forward(self, x, atts): From eff7982886973b86c99982e932b9913a6ecad127 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 21 Mar 2025 16:25:10 +0000 Subject: [PATCH 037/116] Add sad test for numpy arr processing --- .../test_processor_granite_speech.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/models/granite_speech/test_processor_granite_speech.py b/tests/models/granite_speech/test_processor_granite_speech.py index 1f0db5d0a5c0..aee11ab3c59c 100644 --- a/tests/models/granite_speech/test_processor_granite_speech.py +++ b/tests/models/granite_speech/test_processor_granite_speech.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. import pytest -import json import tempfile import unittest import shutil from parameterized import parameterized +import numpy as np import torch from transformers import AutoTokenizer, GPT2TokenizerFast @@ -128,13 +128,15 @@ def test_bad_audios_fails(self): processor(text=None, audios=["foo"]) @parameterized.expand([ - ([1, 269920], [171]), - ([2, 269920], [171, 171]), + ([1, 269920], [171], torch.rand), + ([2, 269920], [171, 171], torch.rand), + ([1, 269920], [171], np.random.rand), + ([2, 269920], [171, 171], np.random.rand), ]) - def test_audio_token_filling_same_len_feature_tensors(self, vec_dims, num_expected_features): + def test_audio_token_filling_same_len_feature_tensors(self, vec_dims, num_expected_features, random_func): """Ensure audio token filling is handled correctly when we have one or more audio inputs whose features are all the same length - stacked into a tensor. + stacked into a tensor / numpy array. """ tokenizer = self.get_tokenizer() feature_extractor = self.get_feature_extractor() @@ -142,7 +144,7 @@ def test_audio_token_filling_same_len_feature_tensors(self, vec_dims, num_expect tokenizer=tokenizer, feature_extractor=feature_extractor, ) - audios = torch.rand(vec_dims) - .5 + audios = random_func(*vec_dims) - .5 audio_tokens = processor.audio_token * vec_dims[0] inputs = processor( From 1f2e4da771bea7cf4c9566a059a499c04d335704 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 21 Mar 2025 17:23:44 +0000 Subject: [PATCH 038/116] Allow numpy arrays / tuples in granite speech processor --- .../granite_speech/processing_granite_speech.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 11a9ce804f1d..e144ddcbd772 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -15,8 +15,10 @@ """ Processor class for Speech Granite. 
""" +from collections.abc import Sequence from typing import List, Union +import numpy as np import torch from transformers.feature_extraction_utils import BatchFeature from transformers.processing_utils import ProcessorMixin @@ -110,18 +112,24 @@ def _get_validated_text(self, text: Union[str, list]) -> List[str]: raise TypeError("Invalid text provided! Text should be a string or list of strings.") def _get_validated_audios(self, audios): - # todo: if this is a list, collate and keep track of audio lengths + # Coerce to PyTorch tensors if we have numpy arrays, since + # currently we have a dependency on torch/torchaudio anyway + if isinstance(audios, np.ndarray): + audios = torch.from_numpy(audios) + elif isinstance(audios, Sequence) and isinstance(audios[0], np.ndarray): + audios = [torch.from_numpy(arr) for arr in audios] + if isinstance(audios, torch.Tensor): lengths = [audios.shape[-1]] * audios.shape[0] return audios, lengths - elif isinstance(audios, list) and isinstance(audios[0], torch.Tensor): + elif isinstance(audios, Sequence) and isinstance(audios[0], torch.Tensor): lengths = [audio.shape[-1] for audio in audios] padding = [max(lengths) - length for length in lengths] padded = [torch.nn.functional.pad(audio, (0, pad)) for audio, pad in zip(audios, padding)] audios = torch.cat(padded, dim=0) return audios, lengths - raise TypeError("Invalid audio provided. Audio should be a Tensor or a list of Tensors.") + raise TypeError("Invalid audio provided. Audio should be a one or more torch tensors or numpy arrays.") __all__ = ["GraniteSpeechProcessor"] \ No newline at end of file From cb6bf4ac7d17427b22386aac3f5de6119f138fa7 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 21 Mar 2025 17:25:15 +0000 Subject: [PATCH 039/116] Fix config type for projector --- .../models/granite_speech/modeling_granite_speech.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 22e3e2c45531..34313a4acc20 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -24,6 +24,7 @@ from .configuration_granite_speech import ( GraniteSpeechConfig, GraniteSpeechEncoderConfig, + GraniteSpeechProjectorConfig, ) logger = logging.get_logger(__name__) @@ -518,7 +519,7 @@ class GraniteSpeechEncoderProjectorPreTrainedModel(PreTrainedModel): models. """ - config_class = GraniteSpeechConfig + config_class = GraniteSpeechProjectorConfig base_model_prefix = "qformer" supports_gradient_checkpointing = True @@ -741,7 +742,7 @@ def forward( # create the model through AutoModel. 
class EncoderProjectorQFormer(nn.Module): - def __init__(self, config: GraniteSpeechConfig): + def __init__(self, config: GraniteSpeechProjectorConfig): super().__init__() self.hidden_size = config.hidden_size self.ds_rate = config.downsample_rate From 4ca7e44795a4fb9a37ecccff1464eb04f8e9ebb5 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Sun, 23 Mar 2025 06:32:35 -0400 Subject: [PATCH 040/116] - pad instead of creating a zeros tensor, to keep the original dtype/device (support bfloat16) - cast input_features to the model dtype (support bfloat16) --- .../granite_speech/modeling_granite_speech.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 34313a4acc20..6327b8ea17ae 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -905,12 +905,11 @@ def forward(self, x, context_size): bs, n, d = x.shape assert(context_size > 0 and context_size <= max_pos_emb) - nb = n // context_size + nb = math.ceil(n / context_size) nr = n % context_size if nr > 0: - y = torch.zeros(x.shape[0], context_size-nr, x.shape[2], device=device) - x = torch.cat((x,y), dim=1) - nb += 1 + # right padding to reach block size + x = torch.nn.functional.pad(x, (0, 0, 0, context_size - nr)) q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, dim = -1)) q, k, v = map( @@ -928,10 +927,11 @@ def forward(self, x, context_size): dots = dots + pos_attn if nr > 0: - mask = torch.ones(context_size, context_size, device=device) + # masked attention in the extended block + mask = torch.ones(context_size, context_size, dtype=bool, device=device) mask[:nr,:nr] = 0 mask_value = -torch.finfo(dots.dtype).max - dots[:,-1,:].masked_fill_(mask.bool(), mask_value) + dots[:,-1,:].masked_fill_(mask, mask_value) attn = dots.softmax(dim = -1) @@ -1239,6 +1239,9 @@ def forward( inputs_embeds = self.get_input_embeddings()(llm_input_ids) if input_features is not None: + if input_features.dtype != self.dtype: + logger.warning(f"input features are casted to {self.dtype}") + input_features = input_features.to(self.dtype) # Get the audio features from the encoder / projector audio_features = self.get_audio_features(input_features) From bd82de027078fe96f6c894b06a75ca51fb0c44a0 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Sun, 23 Mar 2025 06:48:46 -0400 Subject: [PATCH 041/116] merge Blip2QFormerConfig to GraniteSpeechProjectorConfig --- .../configuration_granite_speech.py | 39 ++++++++++++------- .../granite_speech/modeling_granite_speech.py | 3 +- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 541577b3ba93..ff68b439c039 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -1,6 +1,5 @@ from transformers.configuration_utils import PretrainedConfig from transformers.models.auto import AutoConfig -from transformers.models.blip_2.configuration_blip_2 import Blip2QFormerConfig from transformers.models.auto import CONFIG_MAPPING, AutoConfig class GraniteSpeechEncoderConfig(PretrainedConfig): @@ -36,8 +35,8 @@ def __init__( self.conv_expansion_factor = conv_expansion_factor self.use_max_pos_emb_in_pos_emb_calc = 
use_max_pos_emb_in_pos_emb_calc - -class GraniteSpeechProjectorConfig(Blip2QFormerConfig): +## adapted from transformers.models.blip.configuration_blip_2.Blip2VisionConfig +class GraniteSpeechProjectorConfig(PretrainedConfig): def __init__( self, llm_dim=4096, @@ -50,21 +49,31 @@ def __init__( encoder_hidden_size=1024, cross_attention_frequency=1, max_position_embeddings=2048, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", use_qformer_text_input=False, **kwargs, ): - super().__init__( - hidden_size=hidden_size, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - encoder_hidden_size=encoder_hidden_size, - cross_attention_frequency=cross_attention_frequency, - max_position_embeddings=max_position_embeddings, - use_qformer_text_input=use_qformer_text_input, - **kwargs, - ) - + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + self.use_qformer_text_input = use_qformer_text_input self.downsample_rate = downsample_rate self.window_size = window_size self.llm_dim = llm_dim diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 6327b8ea17ae..8263601536fe 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -6,7 +6,6 @@ import torch.utils.checkpoint from torch import nn, einsum import torch.nn.functional as F -GraniteSpeechQFormerConfig = int from transformers.activations import ACT2FN from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from transformers.generation import GenerationMixin @@ -552,7 +551,7 @@ class GraniteSpeechQFormerModel(GraniteSpeechEncoderProjectorPreTrainedModel): Querying Transformer (Q-Former), used in GraniteSpeech. 
""" - def __init__(self, config: GraniteSpeechQFormerConfig): + def __init__(self, config: GraniteSpeechProjectorConfig): super().__init__(config) self.config = config From f7384f6e46d90d41a92d835015035c97d34f50d5 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Sun, 23 Mar 2025 06:51:07 -0400 Subject: [PATCH 042/116] prevent a crash when re-saving/loading the model (line 109) --- .../models/granite_speech/configuration_granite_speech.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index ff68b439c039..afdb2d1d67af 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -37,6 +37,7 @@ def __init__( ## adapted from transformers.models.blip.configuration_blip_2.Blip2VisionConfig class GraniteSpeechProjectorConfig(PretrainedConfig): + model_type = "blip_2_qformer" def __init__( self, llm_dim=4096, From e0f8b530a4e5cb5569598f8cf60270fd29ccede5 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Sun, 23 Mar 2025 07:06:46 -0400 Subject: [PATCH 043/116] consider additional edge cases during preprocessing. --- .../granite_speech/processing_granite_speech.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index e144ddcbd772..b10125c5df06 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -120,11 +120,23 @@ def _get_validated_audios(self, audios): audios = [torch.from_numpy(arr) for arr in audios] if isinstance(audios, torch.Tensor): + if audios.ndim == 1: + audios = audios.unsqueeze(0) + if not torch.is_floating_point(audios): + raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1.") + + if audios.shape[0] > 1: + logger.warning("Audio samples are alrady collated, we'll assume they all have the same length") lengths = [audios.shape[-1]] * audios.shape[0] return audios, lengths + elif isinstance(audios, Sequence) and isinstance(audios[0], torch.Tensor): + if not torch.is_floating_point(audios[0]): + raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1.") lengths = [audio.shape[-1] for audio in audios] padding = [max(lengths) - length for length in lengths] + # ensure all audios have a batch dimension: + audios = [audio.view(1, -1) for audio in audios] padded = [torch.nn.functional.pad(audio, (0, pad)) for audio, pad in zip(audios, padding)] audios = torch.cat(padded, dim=0) return audios, lengths From 085e0fa8a6fa98dd7d9c3447ff30c0b353914200 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Sun, 23 Mar 2025 09:30:22 -0400 Subject: [PATCH 044/116] consider additional edge cases during preprocessing. 
--- .../models/granite_speech/processing_granite_speech.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index b10125c5df06..4e11351e0d94 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -62,6 +62,8 @@ def __call__( if audios is not None: audios, audio_lengths = self._get_validated_audios(audios) + if any(t.count(self.audio_token) != 1 for t in text): + raise ValueError("We're supporting a single audio per input") if len(audio_lengths) != expected_num_audios: raise ValueError("Text/Audio mismatch. The number of audios and audio tokens do not match") From 9c1eac392960a5e8a5cdcd1c111492bff957f650 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Sun, 23 Mar 2025 09:32:27 -0400 Subject: [PATCH 045/116] add features mask for batched inference (bugfix) --- .../models/granite_speech/modeling_granite_speech.py | 6 ++++-- .../models/granite_speech/processing_granite_speech.py | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 8263601536fe..8f9030827444 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -1182,6 +1182,7 @@ def forward( self, input_ids: torch.LongTensor = None, input_features: torch.FloatTensor = None, + input_features_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, @@ -1248,6 +1249,7 @@ def forward( inputs_embeds = self.get_merged_audio_embeddings( input_ids=input_ids, audio_features=audio_features, + input_features_mask=input_features_mask ) outputs = self.language_model( @@ -1327,7 +1329,7 @@ def prepare_inputs_for_generation( model_inputs["input_features"] = input_features return model_inputs - def get_merged_audio_embeddings(self, input_ids, audio_features): + def get_merged_audio_embeddings(self, input_ids, audio_features, input_features_mask): """ Adds the audio token to the model's LLM vocabulary so that we can pass it through the tokenizer; it's assumed that the embeddings corresponding to the @@ -1344,7 +1346,7 @@ def get_merged_audio_embeddings(self, input_ids, audio_features): # Mask the audio features into the text embeddings special_audio_mask = is_audio_index.unsqueeze(-1) - audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) + audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)[input_features_mask] inputs_embeds = inputs_embeds.masked_scatter( special_audio_mask, audio_features, diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 4e11351e0d94..2afc7b8eba21 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -75,7 +75,9 @@ def __call__( num_audio_features = self.feature_extractor._get_num_audio_features( audio_lengths ) - + speech_inputs["input_features_mask"] = torch.arange(max(num_audio_features)).view(1, -1) <= \ + torch.tensor(num_audio_features).view(-1, 1) + 
# duplicate the audio placeholders to match the feature dims text = self._expand_audio_placeholders(text, num_audio_features) else: From 633e8e75a78f6a4c43ca7d1588f02684e0c977cf Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 11:26:39 +0000 Subject: [PATCH 046/116] Minor refactor, remove multiaudio processor tests --- .../granite_speech/processing_granite_speech.py | 14 +++++++------- .../test_processor_granite_speech.py | 10 ++++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 2afc7b8eba21..6e3a2a6e4b91 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -62,8 +62,8 @@ def __call__( if audios is not None: audios, audio_lengths = self._get_validated_audios(audios) - if any(t.count(self.audio_token) != 1 for t in text): - raise ValueError("We're supporting a single audio per input") + if any(text.count(self.audio_token) != 1 for text in text): + raise ValueError("Only one audio sample is currently supported per input") if len(audio_lengths) != expected_num_audios: raise ValueError("Text/Audio mismatch. The number of audios and audio tokens do not match") @@ -81,7 +81,7 @@ def __call__( # duplicate the audio placeholders to match the feature dims text = self._expand_audio_placeholders(text, num_audio_features) else: - assert expected_num_audios == 0, "no audio is provided, expecting no audio tokens." + assert expected_num_audios == 0, "No audio is provided, expecting no audio tokens" text_inputs = self.tokenizer(text, padding=True, **kwargs) return BatchFeature(data={**text_inputs, **speech_inputs}) @@ -127,16 +127,16 @@ def _get_validated_audios(self, audios): if audios.ndim == 1: audios = audios.unsqueeze(0) if not torch.is_floating_point(audios): - raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1.") + raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1") if audios.shape[0] > 1: - logger.warning("Audio samples are alrady collated, we'll assume they all have the same length") + logger.warning("Audio samples are already collated; assuming they all have the same length") lengths = [audios.shape[-1]] * audios.shape[0] return audios, lengths elif isinstance(audios, Sequence) and isinstance(audios[0], torch.Tensor): if not torch.is_floating_point(audios[0]): - raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1.") + raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1") lengths = [audio.shape[-1] for audio in audios] padding = [max(lengths) - length for length in lengths] # ensure all audios have a batch dimension: @@ -145,7 +145,7 @@ def _get_validated_audios(self, audios): audios = torch.cat(padded, dim=0) return audios, lengths - raise TypeError("Invalid audio provided. Audio should be a one or more torch tensors or numpy arrays.") + raise TypeError("Invalid audio provided. 
Audio should be a one or more torch tensors or numpy arrays") __all__ = ["GraniteSpeechProcessor"] \ No newline at end of file diff --git a/tests/models/granite_speech/test_processor_granite_speech.py b/tests/models/granite_speech/test_processor_granite_speech.py index aee11ab3c59c..d97d5a178c39 100644 --- a/tests/models/granite_speech/test_processor_granite_speech.py +++ b/tests/models/granite_speech/test_processor_granite_speech.py @@ -129,14 +129,14 @@ def test_bad_audios_fails(self): @parameterized.expand([ ([1, 269920], [171], torch.rand), - ([2, 269920], [171, 171], torch.rand), ([1, 269920], [171], np.random.rand), - ([2, 269920], [171, 171], np.random.rand), ]) def test_audio_token_filling_same_len_feature_tensors(self, vec_dims, num_expected_features, random_func): """Ensure audio token filling is handled correctly when we have one or more audio inputs whose features are all the same length stacked into a tensor / numpy array. + + NOTE: Currently we enforce that each sample can only have one audio. """ tokenizer = self.get_tokenizer() feature_extractor = self.get_feature_extractor() @@ -178,9 +178,11 @@ def test_audio_token_filling_varying_len_feature_list(self): num_expected_features = [90, 171] audios = [torch.rand(dims) - .5 for dims in vec_dims] - audio_tokens = processor.audio_token * len(vec_dims) inputs = processor( - text=f"{audio_tokens} Can you compare this audio?", + text=[ + f"{processor.audio_token} Can you describe this audio?", + f"{processor.audio_token} How does it compare with this audio?", + ], audios=audios, return_tensors="pt" ) From d40175f2c3b3f48cd3aa1670c888c6b5917549ef Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 05:52:21 +0000 Subject: [PATCH 047/116] Add set input/output embeddings for granite speech --- .../models/granite_speech/modeling_granite_speech.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 8f9030827444..ad5089e5d3b1 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -1165,6 +1165,12 @@ def __init__(self, config: GraniteSpeechConfig, is_legacy=False, skip_lora=True) self.post_init() + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + def get_input_embeddings(self): return self.language_model.get_input_embeddings() From 672c5ccea8407c51a0b92ef6c5df9853748f48bd Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 05:53:13 +0000 Subject: [PATCH 048/116] Fix feature dim check in processor test --- tests/models/granite_speech/test_processor_granite_speech.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/granite_speech/test_processor_granite_speech.py b/tests/models/granite_speech/test_processor_granite_speech.py index d97d5a178c39..a5078ef5fa4a 100644 --- a/tests/models/granite_speech/test_processor_granite_speech.py +++ b/tests/models/granite_speech/test_processor_granite_speech.py @@ -161,8 +161,8 @@ def test_audio_token_filling_same_len_feature_tensors(self, vec_dims, num_expect [vec_dims[1] for _ in range(vec_dims[0])], ) num_audio_tokens = int(torch.sum(inputs["input_ids"] == audio_token_id)) + assert list(inputs["input_features"].shape) == [vec_dims[0], 844, 160] assert 
sum(num_computed_features) == num_audio_tokens - assert num_expected_features == num_expected_features def test_audio_token_filling_varying_len_feature_list(self): """Ensure audio token filling is handled correctly when we have From 56f2e6e044c2b83582f21aceee639b6a52783cde Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 10:48:56 +0000 Subject: [PATCH 049/116] Pop input features in embed test for granite speech --- tests/generation/test_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 68539b71e16a..384c47fef47e 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1659,6 +1659,12 @@ def test_generate_from_inputs_embeds(self, _, num_beams): inputs_dict.pop("pixel_values", None) inputs_dict.pop("pixel_values_videos", None) inputs_dict.pop("pixel_values_images", None) + # HACK - in the case of granite speech, input_features and inputs_embeds are mutually exclusive; + # this is similar to VLMs and should likely be standardized for similar audio models in the future, + # then made generic here. + if "granitespeech" in model_class.__name__.lower(): + inputs_dict.pop("input_features", None) + # 2.C - No easy fix, let's skip the check that compares the outputs from `input_ids` and `inputs_embeds` has_complex_embeds_computation = any( model_name in model_class.__name__.lower() for model_name in ["moshi"] From 22a9d242b4ab18558864078ab032542b4661cd6f Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 10:49:13 +0000 Subject: [PATCH 050/116] Small fixes for test edge cases Add granite speech to seq2seq causal lm mapping names --- src/transformers/models/auto/modeling_auto.py | 1 + .../models/granite_speech/modeling_granite_speech.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1c6828a48396..3f2fb425932d 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -973,6 +973,7 @@ ("encoder-decoder", "EncoderDecoderModel"), ("fsmt", "FSMTForConditionalGeneration"), ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), + ("granite_speech", "GraniteSpeechForConditionalGeneration"), ("led", "LEDForConditionalGeneration"), ("longt5", "LongT5ForConditionalGeneration"), ("m2m_100", "M2M100ForConditionalGeneration"), diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index ad5089e5d3b1..81dddb886329 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -1051,6 +1051,8 @@ def forward(self, x, context_size): class GraniteSpeechPretrainedModel(PreTrainedModel): config_class = GraniteSpeechConfig _supports_cache_class = True + _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): std = self.config.initializer_range @@ -1269,7 +1271,6 @@ def forward( return_dict=return_dict, cache_position=cache_position, logits_to_keep=logits_to_keep, - labels=labels, **lm_kwargs, ) logits = outputs[0] @@ -1359,18 +1360,19 @@ def get_merged_audio_embeddings(self, input_ids, audio_features, input_features_ ) return inputs_embeds - def generate(self, input_features=None, **kwargs): + def generate(self, *args, **kwargs): """This model is expected to have a lora adapater, 
which is only enabled when considering audio inputs. As such, we override generate to conditionally enable / disable the lora adapter based on whether or not any input features were provided. """ + input_features = kwargs.pop("input_features", None) if is_peft_available and self._hf_peft_config_loaded: if input_features is not None: self.enable_adapters() else: self.disable_adapters() - return super().generate(input_features=input_features, **kwargs) + return super().generate(*args, input_features=input_features, **kwargs) __all__ = [ "GraniteSpeechForConditionalGeneration", From c13d3ed285462273affefdfdd511ce4f093599cc Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 11:04:05 +0000 Subject: [PATCH 051/116] Add small tests for granite speech model --- tests/models/granite_speech/__init__.py | 0 .../test_modeling_granite_speech.py | 295 ++++++++++++++++++ 2 files changed, 295 insertions(+) create mode 100644 tests/models/granite_speech/__init__.py create mode 100644 tests/models/granite_speech/test_modeling_granite_speech.py diff --git a/tests/models/granite_speech/__init__.py b/tests/models/granite_speech/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py new file mode 100644 index 000000000000..c37007d6ccdd --- /dev/null +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -0,0 +1,295 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
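The generate override in the previous patch toggles the optional LoRA adapter depending on whether audio features are present. A minimal sketch of that pattern, assuming a PEFT adapter has already been loaded onto the model (the helper name here is hypothetical):

# sketch only: `model` is a GraniteSpeechForConditionalGeneration with a loaded PEFT adapter
def generate_with_optional_audio(model, input_features=None, **kwargs):
    if input_features is not None:
        model.enable_adapters()    # audio turn -> LoRA adapter active
    else:
        model.disable_adapters()   # text-only turn -> base LLM weights
    return model.generate(input_features=input_features, **kwargs)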
+"""Testing suite for the IBM Granite Speech model.""" +import unittest +from ...generation.test_utils import GenerationTesterMixin +import tempfile +from transformers import ( + GraniteSpeechConfig, + GraniteSpeechForConditionalGeneration, + is_torch_available, +) +from transformers.testing_utils import ( + require_torch, + torch_device, + require_torch_sdpa, +) +from ...test_modeling_common import ( + ModelTesterMixin, + floats_tensor, + ids_tensor, + _config_zero_init, + torch_device, +) +from ...test_configuration_common import ConfigTester + + +if is_torch_available(): + import torch + +class GraniteSpeechForConditionalGenerationModelTester: + def __init__( + self, + parent, + seq_length=7, + encoder_config={ + "model_type": "granite_speech_encoder", + "context_size": 200, + "conv_expansion_factor": 2, + "conv_kernel_size": 15, + "dim_head": 32, + "dropout": 0.1, + "feedforward_mult": 4, + "hidden_dim": 32, + "input_dim": 160, + "num_heads": 4, + "num_layers": 2, + "output_dim": 42 + }, + # NOTE - this is pretty much copied from llavanext since llama is identical in configs except for multipliers + text_config={ + "model_type": "granite", + "is_training": True, + "seq_length": 7, + "use_token_type_ids": False, + "use_labels": True, + "vocab_size": 99, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 580, + "type_vocab_size": 16, + "type_sequence_label_size": 2, + "initializer_range": 0.02, + "num_labels": 3, + "num_choices": 4, + "pad_token_id": 1, + }, + projector_config={ + "attention_probs_dropout_prob": 0.1, + "cross_attention_frequency": 1, + "downsample_rate": 5, + "encoder_hidden_size": 32, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 32, + "initializer_range": 0.02, + "intermediate_size": 256, + "layer_norm_eps": 1e-12, + "llm_dim": 32, + "max_position_embeddings": 2048, + "model_type": "blip_2_qformer", + "num_attention_heads": 4, + "num_hidden_layers": 2, + "position_embedding_type": "absolute", + "use_qformer_text_input": False, + "vocab_size": 30522, + "window_size": 15 + }, + audio_token_index=0, + tie_word_embeddings=True, + initializer_range=0.02, + has_lora_adapter=True, + is_training=True, + ): + self.parent = parent + self.projector_config = None + self.encoder_config = encoder_config + self.text_config = text_config + self.projector_config = projector_config + self.audio_token_index = audio_token_index + self.tie_word_embeddings = tie_word_embeddings + self.initializer_range = initializer_range + self.has_lora_adapater = has_lora_adapter + self.is_training = is_training + + self.num_attention_heads = text_config["num_attention_heads"] + self.num_hidden_layers = text_config["num_hidden_layers"] + self.hidden_size = text_config["hidden_size"] + self.batch_size = 3 + self.pad_token_id = text_config["pad_token_id"] + self.seq_len = 7 + self.num_audio_tokens = 2 + self.seq_length = seq_length + self.num_audio_tokens + + + def get_config(self): + return GraniteSpeechConfig( + encoder_config=self.encoder_config, + text_config=self.text_config, + projector_config=self.projector_config, + audio_token_index=self.audio_token_index, + tie_word_embeddings=self.tie_word_embeddings, + initializer_range=self.initializer_range, + has_lora_adapter=self.has_lora_adapater, + ) + + + def prepare_config_and_inputs(self): + input_features = floats_tensor( + # TODO - clean this up + 
[self.batch_size, 844, 160], + ) + config = self.get_config() + return config, input_features + + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_features = config_and_inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 + attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) + input_ids[input_ids == config.audio_token_index] = self.pad_token_id + + input_ids[:, : self.num_audio_tokens] = config.audio_token_index + + inputs_dict = { + "input_features": input_features, + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + def create_and_check_granite_speech_model_fp16_forward( + self, config, input_ids, input_features, attention_mask + ): + model = GraniteSpeechForConditionalGeneration(config=config) + model.to(torch_device) + model.half() + model.eval() + logits = model( + input_ids=input_ids, + attention_mask=attention_mask, + input_features=input_features, + return_dict=True, + )["logits"] + self.parent.assertFalse(torch.isnan(logits).any().item()) + + def create_and_check_granite_speech_model_fp16_autocast_forward( + self, config, input_ids, input_features, attention_mask, + ): + config.torch_dtype = torch.float16 + model = GraniteSpeechForConditionalGeneration(config=config) + model.to(torch_device) + model.eval() + with torch.autocast(device_type="cuda", dtype=torch.float16): + logits = model( + input_ids=input_ids, + attention_mask=attention_mask, + input_features=input_features.to(torch.bfloat16), + return_dict=True, + )["logits"] + self.parent.assertFalse(torch.isnan(logits).any().item()) + + + +@require_torch +class GraniteSpeechForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + """ + Model tester for `GraniteSpeechForConditionalGeneration`. 
+ """ + + all_model_classes = (GraniteSpeechForConditionalGeneration,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + _is_composite = True + + def setUp(self): + self.model_tester = GraniteSpeechForConditionalGenerationModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=GraniteSpeechConfig, + has_text_modality=False, + ) + + def test_inputs_embeds(self): + # overwrite inputs_embeds tests because we need to delete "input features" for the audio model + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["input_features"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + def test_initialization(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if name == "projector.query": + continue + elif param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + # overwrite because Granite Speech is audio+text model (not vision+text) + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + if not self._is_composite: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + for model_class in self.all_model_classes: + # NOTE - currently we only enable alternate attention implementations on + # the encapsulated LLM; in the future, this should be added for the conformer + # encoder as well. 
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" + + # `None` as it is the requested one which will be assigned to each sub-config + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model.language_model.config._attn_implementation == text_attn) + + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") From 7321c129cb55c57e78d02304e530667c050c888c Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 12:47:06 +0000 Subject: [PATCH 052/116] Fix data parallelism test --- .../granite_speech/modeling_granite_speech.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 81dddb886329..23f294bbd0d4 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -781,9 +781,9 @@ class CTCModel(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): super(CTCModel, self).__init__() - self.rnn_trL = [nn.Linear(config.input_dim, config.hidden_dim, bias=True)] - for l in range(config.num_layers): - self.rnn_trL.append( + self.rnn_tr = nn.ModuleList( + [nn.Linear(config.input_dim, config.hidden_dim, bias=True)] + \ + [ ConformerBlock( dim=config.hidden_dim, dim_head=config.dim_head, @@ -796,9 +796,9 @@ def __init__(self, config: GraniteSpeechEncoderConfig): ff_dropout=config.dropout, conv_dropout=config.dropout, use_max_pos_emb_in_pos_emb_calc=config.use_max_pos_emb_in_pos_emb_calc, - ) - ) - self.rnn_tr = nn.Sequential(*self.rnn_trL) + ) for layer_idx in range(config.num_layers) + ] + ) self.out = nn.Linear(config.hidden_dim, config.output_dim, bias=True) self.out_mid = nn.Linear(config.output_dim, config.hidden_dim, bias=True) @@ -809,10 +809,10 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.output_dim = config.output_dim def forward(self, x: torch.Tensor): - x = self.rnn_trL[0](x) - for l in range(1, self.num_layers + 1): - x = self.rnn_trL[l](x, self.context_size) - if l == self.num_layers // 2: + x = self.rnn_tr[0](x) + for idx, layer in enumerate(self.rnn_tr[1:], start=1): + x = layer(x, self.context_size) + if idx == self.num_layers // 2: x_mid = x.clone() x_mid = self.out(x_mid) x += self.out_mid(nn.Softmax(dim=-1)(x_mid)) From 5738ad1aeed0ef8b6880315f0fef538206e2a56d Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 12:54:30 +0000 Subject: [PATCH 053/116] Standardize model class names --- .../granite_speech/modeling_granite_speech.py | 50 
+++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 23f294bbd0d4..01829feeec7c 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -740,7 +740,7 @@ def forward( # transformers.models.X, delete all of the code above, and # create the model through AutoModel. -class EncoderProjectorQFormer(nn.Module): +class GraniteSpeechEncoderProjectorQFormer(nn.Module): def __init__(self, config: GraniteSpeechProjectorConfig): super().__init__() self.hidden_size = config.hidden_size @@ -777,14 +777,14 @@ def forward(self, x, atts): return query_proj ### Encoder -class CTCModel(nn.Module): +class GraniteSpeechCTCModel(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): - super(CTCModel, self).__init__() + super(GraniteSpeechCTCModel, self).__init__() self.rnn_tr = nn.ModuleList( [nn.Linear(config.input_dim, config.hidden_dim, bias=True)] + \ [ - ConformerBlock( + GraniteSpeechConformerBlock( dim=config.hidden_dim, dim_head=config.dim_head, heads=config.num_heads, @@ -820,7 +820,7 @@ def forward(self, x: torch.Tensor): # NOTE: Conformer adapated from: https://github.com/lucidrains/conformer.git -class Permute(nn.Module): +class GraniteSpeechConformerPermute(nn.Module): def __init__(self, dims): super().__init__() self.dims = dims @@ -830,7 +830,7 @@ def forward(self, x): return x -class DepthWiseConv1d(nn.Module): +class GraniteSpeechConformerDepthWiseConv1d(nn.Module): def __init__(self, chan_in, chan_out, kernel_size, padding): super().__init__() self.padding = padding @@ -841,7 +841,7 @@ def forward(self, x): return self.conv(x) -class Scale(nn.Module): +class GraniteSpeechConformerScale(nn.Module): def __init__(self, scale, fn): super().__init__() self.fn = fn @@ -851,7 +851,7 @@ def forward(self, x, **kwargs): return self.fn(x, **kwargs) * self.scale -class PreNorm(nn.Module): +class GraniteSpeechConformerPreNorm(nn.Module): def __init__(self, dim, fn): super().__init__() self.fn = fn @@ -862,7 +862,7 @@ def forward(self, x, **kwargs): return self.fn(x, **kwargs) -class PreNormAttn(nn.Module): +class GraniteSpeechConformerPreNormAttn(nn.Module): def __init__(self, dim, fn): super().__init__() self.fn = fn @@ -873,7 +873,7 @@ def forward(self, x, context_size, **kwargs): return self.fn(x, context_size, **kwargs) -class Attention(nn.Module): +class GraniteSpeechConformerAttention(nn.Module): def __init__( self, dim, @@ -940,7 +940,7 @@ def forward(self, x, context_size): return self.dropout(out) -class FeedForward(nn.Module): +class GraniteSpeechConformerFeedForward(nn.Module): def __init__( self, dim, @@ -960,7 +960,7 @@ def forward(self, x): return self.net(x) -class ConformerConvModule(nn.Module): +class GraniteSpeechConformerConvModule(nn.Module): def __init__( self, dim, @@ -975,17 +975,17 @@ def __init__( self.net = nn.Sequential( nn.LayerNorm(dim), - Permute(dims=(0, 2, 1)), + GraniteSpeechConformerPermute(dims=(0, 2, 1)), nn.Conv1d(dim, inner_dim * 2, 1), nn.GLU(dim=1), - DepthWiseConv1d(inner_dim, + GraniteSpeechConformerDepthWiseConv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding), nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(), nn.SiLU(), nn.Conv1d(inner_dim, dim, 1), - Permute(dims=(0, 2, 1)), + GraniteSpeechConformerPermute(dims=(0, 2, 1)), nn.Dropout(dropout) ) @@ -998,7 +998,7 
@@ def calc_same_padding(kernel_size: int): return (pad, pad - (kernel_size + 1) % 2) -class ConformerBlock(nn.Module): +class GraniteSpeechConformerBlock(nn.Module): def __init__( self, *, @@ -1015,8 +1015,8 @@ def __init__( use_max_pos_emb_in_pos_emb_calc=True, ): super().__init__() - self.ff1 = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) - self.attn = Attention( + self.ff1 = GraniteSpeechConformerFeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) + self.attn = GraniteSpeechConformerAttention( dim=dim, dim_head=dim_head, heads=heads, @@ -1024,18 +1024,18 @@ def __init__( context_size=context_size, use_max_pos_emb_in_pos_emb_calc=use_max_pos_emb_in_pos_emb_calc, ) - self.conv = ConformerConvModule( + self.conv = GraniteSpeechConformerConvModule( dim=dim, causal=False, expansion_factor=conv_expansion_factor, kernel_size=conv_kernel_size, dropout=conv_dropout, ) - self.ff2 = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) + self.ff2 = GraniteSpeechConformerFeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) - self.attn = PreNormAttn(dim, self.attn) - self.ff1 = Scale(0.5, PreNorm(dim, self.ff1)) - self.ff2 = Scale(0.5, PreNorm(dim, self.ff2)) + self.attn = GraniteSpeechConformerPreNormAttn(dim, self.attn) + self.ff1 = GraniteSpeechConformerScale(0.5, GraniteSpeechConformerPreNorm(dim, self.ff1)) + self.ff2 = GraniteSpeechConformerScale(0.5, GraniteSpeechConformerPreNorm(dim, self.ff2)) self.post_norm = nn.LayerNorm(dim) @@ -1154,8 +1154,8 @@ def __init__(self, config: GraniteSpeechConfig, is_legacy=False, skip_lora=True) if self.language_model._tied_weights_keys is not None: self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] - self.encoder = CTCModel(config.encoder_config) - self.projector = EncoderProjectorQFormer(config.projector_config) + self.encoder = GraniteSpeechCTCModel(config.encoder_config) + self.projector = GraniteSpeechEncoderProjectorQFormer(config.projector_config) if config.has_lora_adapter and not is_peft_available(): logger.warning( From bead7841a43c7c472efc1291b594a5f4c3c30413 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 13:12:28 +0000 Subject: [PATCH 054/116] Fix check for copies --- .../granite_speech/modeling_granite_speech.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 01829feeec7c..ccb222d2e38d 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -72,7 +72,7 @@ class GraniteSpeechCausalLMOutputWithPast(ModelOutput): # which will should involve pulling it out into its own dir so that it is accessible # under transformers.models.X. 
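Related to the data-parallelism fix a couple of patches back: conformer blocks kept in a plain Python list are invisible to nn.Module bookkeeping, whereas nn.ModuleList registers them so .to(), .parameters() and replication see every layer. A toy illustration (not the actual encoder):

import torch
from torch import nn

class BrokenEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = [nn.Linear(4, 4) for _ in range(2)]            # plain list: not registered

class FixedEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(4, 4) for _ in range(2)])

print(len(list(BrokenEncoder().parameters())))  # 0 -> layers won't move devices / replicate
print(len(list(FixedEncoder().parameters())))   # 4 -> weight + bias for each layer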
-# Copied from transformers.models.blip_2.modeling_blip2.Blip2QFormerMultiHeadAttention with Blip2->GraniteSpeech +# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerMultiHeadAttention with Blip2->GraniteSpeech class GraniteSpeechQFormerMultiHeadAttention(nn.Module): def __init__(self, config, is_cross_attention=False): super().__init__() @@ -219,7 +219,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states -# Copied from transformers.models.blip_2.modeling_blip2.Blip2QFormerAttention with Blip2->GraniteSpeech +# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerAttention with Blip2->GraniteSpeech class GraniteSpeechQFormerAttention(nn.Module): def __init__(self, config, is_cross_attention=False): super().__init__() @@ -312,7 +312,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states -# Copied from transformers.models.blip_2.modeling_blip2.Blip2QFormerLayer with Blip2->GraniteSpeech +# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerLayer with Blip2->GraniteSpeech class GraniteSpeechQFormerLayer(nn.Module): def __init__(self, config, layer_idx): super().__init__() @@ -416,7 +416,7 @@ def feed_forward_chunk_query(self, attention_output): layer_output = self.output_query(intermediate_output, attention_output) return layer_output -# Copied from transformers.models.blip_2.modeling_blip2.Blip2QFormerEncoder with Blip2->GraniteSpeech +# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerEncoder with Blip2->GraniteSpeech class GraniteSpeechQFormerEncoder(nn.Module): def __init__(self, config): super().__init__() @@ -511,7 +511,7 @@ def forward( cross_attentions=all_cross_attentions, ) -# Copied from transformers.models.blip.modeling_blip.Blip2PreTrainedModel with Blip2->GraniteSpeechEncoderProjector + class GraniteSpeechEncoderProjectorPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -545,7 +545,6 @@ def _init_weights(self, module): module.bias.data.zero_() -# Copied from transformers.models.blip.modeling_blip.GraniteSpeechQFormerModel with GraniteSpeech->GraniteSpeech class GraniteSpeechQFormerModel(GraniteSpeechEncoderProjectorPreTrainedModel): """ Querying Transformer (Q-Former), used in GraniteSpeech. @@ -562,12 +561,15 @@ def __init__(self, config: GraniteSpeechProjectorConfig): self.post_init() + # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerModel.get_input_embeddings def get_input_embeddings(self): return self.embeddings.word_embeddings + # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerModel.set_input_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value + # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerModel._prune_heads def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base @@ -576,6 +578,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerModel.get_extended_attention_mask def get_extended_attention_mask( self, attention_mask: torch.Tensor, @@ -621,6 +624,7 @@ def get_extended_attention_mask( extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask + # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerModel.forward def forward( self, query_embeds: torch.FloatTensor, From 49611910ce861e1b2a5bacdf53f2804f4109a04e Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 16:51:44 +0000 Subject: [PATCH 055/116] Fix misaligned init check --- src/transformers/models/granite_speech/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/granite_speech/__init__.py b/src/transformers/models/granite_speech/__init__.py index 11c4d41252ad..eb1519f451cd 100644 --- a/src/transformers/models/granite_speech/__init__.py +++ b/src/transformers/models/granite_speech/__init__.py @@ -37,6 +37,7 @@ else: _import_structure["modeling_granite_speech"] = [ "GraniteSpeechForConditionalGeneration", + "GraniteSpeechPretrainedModel", ] try: From 67197049e382b5dcb586546467254726c9e73c93 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 16:52:02 +0000 Subject: [PATCH 056/116] Skip granite speech in checkpoint check --- utils/check_config_docstrings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check_config_docstrings.py b/utils/check_config_docstrings.py index a22abd238810..7de82aff81af 100644 --- a/utils/check_config_docstrings.py +++ b/utils/check_config_docstrings.py @@ -48,6 +48,7 @@ "GraniteConfig", "GraniteMoeConfig", "Qwen3MoeConfig", + "GraniteSpeechConfig", } From 938becea02e7013c7e41d543dab7d290145d65c3 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 16:52:20 +0000 Subject: [PATCH 057/116] Use default for tie_word_embeddings in granite speech --- .../models/granite_speech/configuration_granite_speech.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index afdb2d1d67af..39eae648c4cb 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -94,7 +94,6 @@ def __init__( text_config=None, projector_config=None, audio_token_index=49155, - tie_word_embeddings=True, initializer_range=0.02, has_lora_adapter=True, **kwargs, @@ -123,6 +122,6 @@ def __init__( self.audio_token_index = audio_token_index self.initializer_range = initializer_range self.has_lora_adapter = has_lora_adapter - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(**kwargs) __all__ = ["GraniteSpeechEncoderConfig", "GraniteSpeechProjectorConfig", "GraniteSpeechConfig"] \ No newline at end of file From d6145dd475fe7a6547c52f150079d4cbb2ed7242 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 17:07:46 +0000 Subject: [PATCH 058/116] Fix non documentation granite speech repo issues --- src/transformers/models/granite_speech/__init__.py | 8 ++++++-- .../models/granite_speech/modeling_granite_speech.py | 10 ++++++---- utils/check_repo.py | 2 ++ 
3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/granite_speech/__init__.py b/src/transformers/models/granite_speech/__init__.py index eb1519f451cd..ce5be5b97059 100644 --- a/src/transformers/models/granite_speech/__init__.py +++ b/src/transformers/models/granite_speech/__init__.py @@ -37,7 +37,9 @@ else: _import_structure["modeling_granite_speech"] = [ "GraniteSpeechForConditionalGeneration", - "GraniteSpeechPretrainedModel", + "GraniteSpeechPreTrainedModel", + "GraniteSpeechEncoderProjectorPreTrainedModel", + "GraniteSpeechQFormerModel", ] try: @@ -66,7 +68,9 @@ else: from .modeling_granite_speech import ( GraniteSpeechForConditionalGeneration, - GraniteSpeechPretrainedModel, + GraniteSpeechPreTrainedModel, + GraniteSpeechEncoderProjectorPreTrainedModel, + GraniteSpeechQFormerModel, ) try: diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index ccb222d2e38d..c4f1c63cc62b 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -1052,7 +1052,7 @@ def forward(self, x, context_size): return x -class GraniteSpeechPretrainedModel(PreTrainedModel): +class GraniteSpeechPreTrainedModel(PreTrainedModel): config_class = GraniteSpeechConfig _supports_cache_class = True _supports_flash_attn_2 = True @@ -1146,8 +1146,8 @@ def _init_weights(self, module): """The Granite Speech model, which consists of an audio encoder, projector, and language model.""", GRANITE_SPEECH_START_DOCSTRING, ) -class GraniteSpeechForConditionalGeneration(GraniteSpeechPretrainedModel, GenerationMixin): - def __init__(self, config: GraniteSpeechConfig, is_legacy=False, skip_lora=True): +class GraniteSpeechForConditionalGeneration(GraniteSpeechPreTrainedModel, GenerationMixin): + def __init__(self, config: GraniteSpeechConfig): super().__init__(config) # NOTE: It doesn't matter when we initialize from config, but we should be careful # to make sure this does not pick up the adapter_config if in the future we use @@ -1380,5 +1380,7 @@ def generate(self, *args, **kwargs): __all__ = [ "GraniteSpeechForConditionalGeneration", - "GraniteSpeechPretrainedModel", + "GraniteSpeechPreTrainedModel", + "GraniteSpeechEncoderProjectorPreTrainedModel", + "GraniteSpeechQFormerModel", ] diff --git a/utils/check_repo.py b/utils/check_repo.py index 4dcfadefc9e2..a34a6ca4e6cc 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -110,6 +110,7 @@ "FastSpeech2ConformerHifiGan", # Already tested by SpeechT5HifiGan (# Copied from) "FastSpeech2ConformerWithHifiGan", # Built with two smaller (tested) models. "GraphormerDecoderHead", # Building part of bigger (tested) model. + "GraniteSpeechQFormerModel", # Building part of bigger (tested) model. "JukeboxVQVAE", # Building part of bigger (tested) model. "JukeboxPrior", # Building part of bigger (tested) model. "DecisionTransformerGPT2Model", # Building part of bigger (tested) model. 
@@ -195,6 +196,7 @@ "GitVisionModel", "GraphormerModel", "GraphormerForGraphClassification", + "GraniteSpeechQFormerModel", "BlipForImageTextRetrieval", "BlipForQuestionAnswering", "BlipVisionModel", From 7e816b44c12bbbda8e0d91776ebd63c1a89313eb Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 17:42:13 +0000 Subject: [PATCH 059/116] Fix comments and docstring checks --- .../granite_speech/modeling_granite_speech.py | 35 +++++++++++++++---- .../test_modeling_granite_speech.py | 1 - 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index c4f1c63cc62b..395733e076c4 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -1052,6 +1052,26 @@ def forward(self, x, context_size): return x + +GRANITE_SPEECH_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config (`GraniteSpeechConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" +@add_start_docstrings( + "The bare Granite Speech Model outputting raw hidden-states without any specific head on top.", + GRANITE_SPEECH_START_DOCSTRING, +) class GraniteSpeechPreTrainedModel(PreTrainedModel): config_class = GraniteSpeechConfig _supports_cache_class = True @@ -1070,7 +1090,7 @@ def _init_weights(self, module): module.weight.data[module.padding_idx].zero_() -GRANITE_SPEECH_START_DOCSTRING = r""" +GRANITE_SPEECH_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide @@ -1080,10 +1100,13 @@ def _init_weights(self, module): [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) - input_features (`torch.FloatTensor` of shape `(batch_size, #TODO, #TODO)): - The tensors corresponding to the input images. input features can be obtained using - [`AutoImageProcessor`]. See [`GraniteSpeechFeatureExtractor.__call__`] for details. + input_features (`torch.FloatTensor` of shape `(batch_size, audio seq len, mel feat dim)): + The tensors corresponding to the input audios. input features can be obtained using + [`AutoFeatureExtractor`]. See [`GraniteSpeechFeatureExtractor.__call__`] for details. [`GraniteSpeechProcessor`] uses [`GraniteSpeechFeatureExtractor`] for processing audio. + input_mask (`torch.Tensor`, *optional*) + Mask for extracted audio features that should should be ignored when creating the merged + multimodal representation (i.e., due to padding). attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: @@ -1122,8 +1145,6 @@ def _init_weights(self, module): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - - use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). @@ -1188,7 +1209,7 @@ def get_audio_features(self, input_features): projected_embeds = self.projector(encoder_embeds, None) return projected_embeds - @add_start_docstrings_to_model_forward(GRANITE_SPEECH_START_DOCSTRING) + @add_start_docstrings_to_model_forward(GRANITE_SPEECH_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=GraniteSpeechCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( self, diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index c37007d6ccdd..65c700bbbd12 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -58,7 +58,6 @@ def __init__( "num_layers": 2, "output_dim": 42 }, - # NOTE - this is pretty much copied from llavanext since llama is identical in configs except for multipliers text_config={ "model_type": "granite", "is_training": True, From 4f4659f79057eb2935fadbce6b6c3174be0a0536 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 17:57:09 +0000 Subject: [PATCH 060/116] Add placeholder docs for granite speech --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/granite_speech.md | 60 ++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 docs/source/en/model_doc/granite_speech.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 1a41368392fd..0541d5f24cc1 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -489,6 +489,8 @@ title: GraniteMoe - local: model_doc/granitemoeshared title: GraniteMoeShared + - local: model_doc/granite_speech + title: GraniteSpeech - local: model_doc/granitevision title: GraniteVision - local: model_doc/helium diff --git a/docs/source/en/model_doc/granite_speech.md b/docs/source/en/model_doc/granite_speech.md new file mode 100644 index 000000000000..ae5f85f18276 --- /dev/null +++ b/docs/source/en/model_doc/granite_speech.md @@ -0,0 +1,60 @@ + + +# Granite Speech + +
+PyTorch +
+ +## Overview +Currently being updated! + +## GraniteSpeechConfig + +[[autodoc]] GraniteSpeechConfig + + +## GraniteSpeechEncoderConfig + +[[autodoc]] GraniteSpeechEncoderConfig + + +## GraniteSpeechProjectorConfig + +[[autodoc]] GraniteSpeechProjectorConfig + + +## GraniteSpeechProcessor + +[[autodoc]] GraniteSpeechProcessor + + +## GraniteSpeechFeatureExtractor + +[[autodoc]] GraniteSpeechFeatureExtractor + + +## GraniteSpeechQFormerModel + +[[autodoc]] GraniteSpeechQFormerModel + - forward + + +## GraniteSpeechForConditionalGeneration + +[[autodoc]] GraniteSpeechForConditionalGeneration + - forward From 7b8cd96d4c23f890e8554ecc7b4f16ba3edaa710 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 19:05:32 +0000 Subject: [PATCH 061/116] Fix test naming collision --- tests/models/granite_speech/test_processor_granite_speech.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/granite_speech/test_processor_granite_speech.py b/tests/models/granite_speech/test_processor_granite_speech.py index a5078ef5fa4a..29e44c3d3d01 100644 --- a/tests/models/granite_speech/test_processor_granite_speech.py +++ b/tests/models/granite_speech/test_processor_granite_speech.py @@ -115,7 +115,7 @@ def test_bad_audios_fails(self): with pytest.raises(TypeError): processor(text=None, audios="foo") - def test_bad_audios_fails(self): + def test_nested_bad_audios_fails(self): """Ensure we gracefully fail if audio is the wrong nested type.""" tokenizer = self.get_tokenizer() feature_extractor = self.get_feature_extractor() From 92eb14417bd8270d905c457a48b2fb9236e54df0 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 19:17:54 +0000 Subject: [PATCH 062/116] Code formatting --- .../models/auto/configuration_auto.py | 4 +- .../models/granite_speech/__init__.py | 5 +- .../configuration_granite_speech.py | 9 +- .../feature_extraction_granite_speech.py | 29 ++-- .../granite_speech/modeling_granite_speech.py | 143 ++++++++---------- .../processing_granite_speech.py | 31 ++-- .../test_modeling_granite_speech.py | 42 ++--- .../test_processor_granite_speech.py | 38 ++--- utils/check_repo.py | 2 +- 9 files changed, 144 insertions(+), 159 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6d5677e833dd..0d6e02606634 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -142,10 +142,10 @@ ("gptj", "GPTJConfig"), ("gptsan-japanese", "GPTSanJapaneseConfig"), ("granite", "GraniteConfig"), + ("granite_speech", "GraniteSpeechConfig"), ("granitemoe", "GraniteMoeConfig"), ("granitemoeshared", "GraniteMoeSharedConfig"), ("granitevision", "LlavaNextConfig"), - ("granite_speech", "GraniteSpeechConfig"), ("graphormer", "GraphormerConfig"), ("grounding-dino", "GroundingDinoConfig"), ("groupvit", "GroupViTConfig"), @@ -492,10 +492,10 @@ ("gptj", "GPT-J"), ("gptsan-japanese", "GPTSAN-japanese"), ("granite", "Granite"), + ("granite_speech", "GraniteSpeech"), ("granitemoe", "GraniteMoeMoe"), ("granitemoeshared", "GraniteMoeSharedMoe"), ("granitevision", "LLaVA-NeXT"), - ("granite_speech", "GraniteSpeech"), ("graphormer", "Graphormer"), ("grounding-dino", "Grounding DINO"), ("groupvit", "GroupViT"), diff --git a/src/transformers/models/granite_speech/__init__.py b/src/transformers/models/granite_speech/__init__.py index ce5be5b97059..e44510c3e43f 100644 --- a/src/transformers/models/granite_speech/__init__.py +++ 
b/src/transformers/models/granite_speech/__init__.py @@ -21,6 +21,7 @@ ) from ...utils.import_utils import define_import_structure + _import_structure = { "configuration_granite_speech": [ "GraniteSpeechConfig", @@ -52,7 +53,6 @@ _import_structure["processing_granite_speech"] = ["GraniteSpeechProcessor"] - if TYPE_CHECKING: from .configuration_granite_speech import ( GraniteSpeechConfig, @@ -67,9 +67,9 @@ pass else: from .modeling_granite_speech import ( + GraniteSpeechEncoderProjectorPreTrainedModel, GraniteSpeechForConditionalGeneration, GraniteSpeechPreTrainedModel, - GraniteSpeechEncoderProjectorPreTrainedModel, GraniteSpeechQFormerModel, ) @@ -83,5 +83,6 @@ from .processing_granite_speech import GraniteSpeechProcessor else: import sys + _file = globals()["__file__"] sys.modules[__name__] = _LazyModule(__name__, _file, _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 39eae648c4cb..1fc6a7e8c1e1 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -1,7 +1,7 @@ from transformers.configuration_utils import PretrainedConfig -from transformers.models.auto import AutoConfig from transformers.models.auto import CONFIG_MAPPING, AutoConfig + class GraniteSpeechEncoderConfig(PretrainedConfig): model_type = "granite_speech_encoder" @@ -35,9 +35,11 @@ def __init__( self.conv_expansion_factor = conv_expansion_factor self.use_max_pos_emb_in_pos_emb_calc = use_max_pos_emb_in_pos_emb_calc + ## adapted from transformers.models.blip.configuration_blip_2.Blip2VisionConfig class GraniteSpeechProjectorConfig(PretrainedConfig): model_type = "blip_2_qformer" + def __init__( self, llm_dim=4096, @@ -113,7 +115,7 @@ def __init__( projector_config = GraniteSpeechProjectorConfig() if not isinstance(encoder_config, GraniteSpeechEncoderConfig): - encoder_config = dict() if encoder_config is None else encoder_config + encoder_config = {} if encoder_config is None else encoder_config encoder_config = GraniteSpeechEncoderConfig(**encoder_config) self.text_config = text_config @@ -124,4 +126,5 @@ def __init__( self.has_lora_adapter = has_lora_adapter super().__init__(**kwargs) -__all__ = ["GraniteSpeechEncoderConfig", "GraniteSpeechProjectorConfig", "GraniteSpeechConfig"] \ No newline at end of file + +__all__ = ["GraniteSpeechEncoderConfig", "GraniteSpeechProjectorConfig", "GraniteSpeechConfig"] diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py index 79c7b18216aa..ebf145a8e39f 100644 --- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -15,10 +15,13 @@ """ Feature extractor class for Speech Granite """ -from typing import Optional, List + import math +from typing import List, Optional + from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin -from transformers.utils import logging, is_torch_available, is_torchaudio_available +from transformers.utils import is_torch_available, is_torchaudio_available, logging + logger = logging.get_logger(__name__) @@ -28,6 +31,7 @@ if is_torchaudio_available(): import torchaudio + class GraniteSpeechFeatureExtractor(FeatureExtractionMixin): model_input_names = 
["input_features"] @@ -48,7 +52,7 @@ def __init__( "n_fft": n_fft, "win_length": win_length, "hop_length": hop_length, - "n_mels": n_mels + "n_mels": n_mels, } # HACK - for now, lazily initialize the mel spectrogram transform; # the feature extractor mixin explodes otherwise because @@ -60,14 +64,12 @@ def __init__( def _ensure_melspec_transform_is_initialized(self): if self.melspec is None: - self.melspec = torchaudio.transforms.MelSpectrogram( - **self.melspec_kwargs - ) + self.melspec = torchaudio.transforms.MelSpectrogram(**self.melspec_kwargs) def __call__( self, x: torch.Tensor, - device: Optional[str]="cpu", + device: Optional[str] = "cpu", ) -> BatchFeature: # TODO there is probably a better way to do both of these things... self._ensure_melspec_transform_is_initialized() @@ -80,18 +82,17 @@ def __call__( B, _ = x.shape with torch.no_grad(): mel = melspec(x.float()) - logmel = mel.transpose(-1,-2).clip_(min=1e-10).log10_() - mx = logmel.amax(dim=(-2,-1), keepdim=True) + logmel = mel.transpose(-1, -2).clip_(min=1e-10).log10_() + mx = logmel.amax(dim=(-2, -1), keepdim=True) logmel = torch.maximum(logmel, mx - 8.0).div_(4).add_(1) if logmel.shape[1] % 2 == 1: - logmel = logmel[:,:-1] # remove last frame if odd + logmel = logmel[:, :-1] # remove last frame if odd x = logmel.reshape(B, -1, 2 * logmel.shape[-1]) # stacking and skipping by 2 if x.device != "cpu": return x.detach().cpu() return x - def _get_num_audio_features(self, audio_lengths: List[int]) -> List[int]: """ Gets the (variable length) variable length number of features @@ -110,8 +111,8 @@ def _get_num_audio_features(self, audio_lengths: List[int]) -> List[int]: # projector output length projector_length = nblocks * effective_window_size projector_lengths.append(projector_length) - + return projector_lengths - -__all__ = ["GraniteSpeechFeatureExtractor"] \ No newline at end of file + +__all__ = ["GraniteSpeechFeatureExtractor"] diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 395733e076c4..f60940d4e9cd 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -1,17 +1,22 @@ import math from dataclasses import dataclass -from typing import List, Optional, Union, Tuple +from typing import List, Optional, Tuple, Union import torch -import torch.utils.checkpoint -from torch import nn, einsum import torch.nn.functional as F +import torch.utils.checkpoint +from torch import einsum, nn + from transformers.activations import ACT2FN -from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from transformers.generation import GenerationMixin +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) from transformers.modeling_utils import PreTrainedModel -from transformers.modeling_outputs import ModelOutput, BaseModelOutputWithPoolingAndCrossAttentions, BaseModelOutputWithPastAndCrossAttentions from transformers.models.auto import AutoModelForCausalLM +from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from transformers.utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, @@ -26,10 +31,12 @@ GraniteSpeechProjectorConfig, ) + logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = 
"GraniteSpeechConfig" + @dataclass class GraniteSpeechCausalLMOutputWithPast(ModelOutput): """ @@ -58,6 +65,7 @@ class GraniteSpeechCausalLMOutputWithPast(ModelOutput): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ + loss: Optional[torch.FloatTensor] = None logits: torch.FloatTensor = None past_key_values: Optional[List[torch.FloatTensor]] = None @@ -72,6 +80,7 @@ class GraniteSpeechCausalLMOutputWithPast(ModelOutput): # which will should involve pulling it out into its own dir so that it is accessible # under transformers.models.X. + # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerMultiHeadAttention with Blip2->GraniteSpeech class GraniteSpeechQFormerMultiHeadAttention(nn.Module): def __init__(self, config, is_cross_attention=False): @@ -219,6 +228,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerAttention with Blip2->GraniteSpeech class GraniteSpeechQFormerAttention(nn.Module): def __init__(self, config, is_cross_attention=False): @@ -268,6 +278,7 @@ def forward( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + # Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GraniteSpeechQFormer class GraniteSpeechQFormerIntermediate(nn.Module): def __init__(self, config): @@ -298,19 +309,6 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GraniteSpeechQFormer -class GraniteSpeechQFormerOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerLayer with Blip2->GraniteSpeech class GraniteSpeechQFormerLayer(nn.Module): @@ -416,6 +414,7 @@ def feed_forward_chunk_query(self, attention_output): layer_output = self.output_query(intermediate_output, attention_output) return layer_output + # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerEncoder with Blip2->GraniteSpeech class GraniteSpeechQFormerEncoder(nn.Module): def __init__(self, config): @@ -740,10 +739,13 @@ def forward( attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) + + # TODO (alex) - refactor GraniteSpeechQformer to be available under # transformers.models.X, delete all of the code above, and # create the model through AutoModel. 
+ class GraniteSpeechEncoderProjectorQFormer(nn.Module): def __init__(self, config: GraniteSpeechProjectorConfig): super().__init__() @@ -774,20 +776,19 @@ def forward(self, x, atts): return_dict=True, ) query_proj = self.linear( - query_output.last_hidden_state.view( - batch_size, nblocks * self.window_size // self.ds_rate, -1 - ) + query_output.last_hidden_state.view(batch_size, nblocks * self.window_size // self.ds_rate, -1) ) return query_proj + ### Encoder class GraniteSpeechCTCModel(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): super(GraniteSpeechCTCModel, self).__init__() self.rnn_tr = nn.ModuleList( - [nn.Linear(config.input_dim, config.hidden_dim, bias=True)] + \ - [ + [nn.Linear(config.input_dim, config.hidden_dim, bias=True)] + + [ GraniteSpeechConformerBlock( dim=config.hidden_dim, dim_head=config.dim_head, @@ -800,7 +801,8 @@ def __init__(self, config: GraniteSpeechEncoderConfig): ff_dropout=config.dropout, conv_dropout=config.dropout, use_max_pos_emb_in_pos_emb_calc=config.use_max_pos_emb_in_pos_emb_calc, - ) for layer_idx in range(config.num_layers) + ) + for layer_idx in range(config.num_layers) ] ) @@ -828,7 +830,7 @@ class GraniteSpeechConformerPermute(nn.Module): def __init__(self, dims): super().__init__() self.dims = dims - + def forward(self, x): x = x.permute(self.dims) return x @@ -838,7 +840,7 @@ class GraniteSpeechConformerDepthWiseConv1d(nn.Module): def __init__(self, chan_in, chan_out, kernel_size, padding): super().__init__() self.padding = padding - self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups = chan_in, bias=False) + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in, bias=False) def forward(self, x): x = F.pad(x, self.padding) @@ -883,18 +885,18 @@ def __init__( dim, heads=8, dim_head=64, - dropout=0., + dropout=0.0, context_size=200, max_pos_emb=512, use_max_pos_emb_in_pos_emb_calc=True, ): super().__init__() inner_dim = dim_head * heads - self.heads= heads + self.heads = heads self.dim_head = dim_head - self.scale = dim_head ** -0.5 - self.to_q = nn.Linear(dim, inner_dim, bias = False) - self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False) + self.scale = dim_head**-0.5 + self.to_q = nn.Linear(dim, inner_dim, bias=False) + self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) self.to_out = nn.Linear(inner_dim, dim) self.max_pos_emb = max_pos_emb @@ -906,7 +908,7 @@ def __init__( def forward(self, x, context_size): device, h, max_pos_emb = x.device, self.heads, self.max_pos_emb bs, n, d = x.shape - assert(context_size > 0 and context_size <= max_pos_emb) + assert context_size > 0 and context_size <= max_pos_emb nb = math.ceil(n / context_size) nr = n % context_size @@ -914,50 +916,39 @@ def forward(self, x, context_size): # right padding to reach block size x = torch.nn.functional.pad(x, (0, 0, 0, context_size - nr)) - q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, dim = -1)) - q, k, v = map( - lambda t: t.reshape(bs, nb, context_size, h, -1).transpose(2, 3), - (q, k, v), - ) - dots = einsum('b m h i d, b m h j d -> b m h i j', q, k) * self.scale + q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, dim=-1)) + q, k, v = [t.reshape(bs, nb, context_size, h, -1).transpose(2, 3) for t in (q, k, v)] + + dots = einsum("b m h i d, b m h j d -> b m h i j", q, k) * self.scale # shaw's relative positional embedding - seq = torch.arange(context_size, device = device) + seq = torch.arange(context_size, device=device) dist = seq.view(-1, 1) - seq.view(1, -1) - dist = torch.clamp(dist,-context_size, context_size) 
+ self.offset + dist = torch.clamp(dist, -context_size, context_size) + self.offset rel_pos_emb = self.rel_pos_emb(dist).to(q) - pos_attn = einsum('b m h c d, c r d -> b m h c r', q, rel_pos_emb) * self.scale + pos_attn = einsum("b m h c d, c r d -> b m h c r", q, rel_pos_emb) * self.scale dots = dots + pos_attn if nr > 0: # masked attention in the extended block mask = torch.ones(context_size, context_size, dtype=bool, device=device) - mask[:nr,:nr] = 0 + mask[:nr, :nr] = 0 mask_value = -torch.finfo(dots.dtype).max - dots[:,-1,:].masked_fill_(mask, mask_value) + dots[:, -1, :].masked_fill_(mask, mask_value) - attn = dots.softmax(dim = -1) + attn = dots.softmax(dim=-1) - out = einsum('b m h i j, b m h j d -> b m h i d', attn, v) + out = einsum("b m h i j, b m h j d -> b m h i d", attn, v) out = out.transpose(2, 3).reshape(bs, x.shape[1], -1) - out = self.to_out(out[:,:n,:]) + out = self.to_out(out[:, :n, :]) return self.dropout(out) class GraniteSpeechConformerFeedForward(nn.Module): - def __init__( - self, - dim, - mult=4, - dropout=0. - ): + def __init__(self, dim, mult=4, dropout=0.0): super().__init__() self.net = nn.Sequential( - nn.Linear(dim, dim * mult), - nn.SiLU(), - nn.Dropout(dropout), - nn.Linear(dim * mult, dim), - nn.Dropout(dropout) + nn.Linear(dim, dim * mult), nn.SiLU(), nn.Dropout(dropout), nn.Linear(dim * mult, dim), nn.Dropout(dropout) ) def forward(self, x): @@ -965,13 +956,7 @@ def forward(self, x): class GraniteSpeechConformerConvModule(nn.Module): - def __init__( - self, - dim, - causal=False, - expansion_factor=2, - kernel_size=31, - dropout=0.): + def __init__(self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0): super().__init__() inner_dim = dim * expansion_factor @@ -982,15 +967,12 @@ def __init__( GraniteSpeechConformerPermute(dims=(0, 2, 1)), nn.Conv1d(dim, inner_dim * 2, 1), nn.GLU(dim=1), - GraniteSpeechConformerDepthWiseConv1d(inner_dim, - inner_dim, - kernel_size=kernel_size, - padding=padding), + GraniteSpeechConformerDepthWiseConv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding), nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(), nn.SiLU(), nn.Conv1d(inner_dim, dim, 1), GraniteSpeechConformerPermute(dims=(0, 2, 1)), - nn.Dropout(dropout) + nn.Dropout(dropout), ) def forward(self, x): @@ -1013,9 +995,9 @@ def __init__( conv_expansion_factor=2, conv_kernel_size=31, context_size=-1, - attn_dropout=0., - ff_dropout=0., - conv_dropout=0., + attn_dropout=0.0, + ff_dropout=0.0, + conv_dropout=0.0, use_max_pos_emb_in_pos_emb_calc=True, ): super().__init__() @@ -1052,7 +1034,6 @@ def forward(self, x, context_size): return x - GRANITE_SPEECH_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads @@ -1068,6 +1049,8 @@ def forward(self, x, context_size): load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
""" + + @add_start_docstrings( "The bare Granite Speech Model outputting raw hidden-states without any specific head on top.", GRANITE_SPEECH_START_DOCSTRING, @@ -1275,14 +1258,12 @@ def forward( if input_features.dtype != self.dtype: logger.warning(f"input features are casted to {self.dtype}") input_features = input_features.to(self.dtype) - # Get the audio features from the encoder / projector + # Get the audio features from the encoder / projector audio_features = self.get_audio_features(input_features) # Merge the audio features into the LLM embeddings inputs_embeds = self.get_merged_audio_embeddings( - input_ids=input_ids, - audio_features=audio_features, - input_features_mask=input_features_mask + input_ids=input_ids, audio_features=audio_features, input_features_mask=input_features_mask ) outputs = self.language_model( @@ -1292,7 +1273,7 @@ def forward( inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_hidden_states=output_hidden_states, return_dict=return_dict, cache_position=cache_position, logits_to_keep=logits_to_keep, @@ -1330,7 +1311,6 @@ def forward( attentions=outputs.attentions, ) - def prepare_inputs_for_generation( self, input_ids, @@ -1372,9 +1352,7 @@ def get_merged_audio_embeddings(self, input_ids, audio_features, input_features_ """ is_audio_index = input_ids == self.config.audio_token_index llm_input_ids = torch.where(is_audio_index, 0, input_ids) - inputs_embeds = self.language_model.get_input_embeddings()( - llm_input_ids - ) # [bsz, # features, hidden size] + inputs_embeds = self.language_model.get_input_embeddings()(llm_input_ids) # [bsz, # features, hidden size] # Mask the audio features into the text embeddings special_audio_mask = is_audio_index.unsqueeze(-1) @@ -1399,6 +1377,7 @@ def generate(self, *args, **kwargs): self.disable_adapters() return super().generate(*args, input_features=input_features, **kwargs) + __all__ = [ "GraniteSpeechForConditionalGeneration", "GraniteSpeechPreTrainedModel", diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 6e3a2a6e4b91..1d4c0cfbfe62 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -15,11 +15,13 @@ """ Processor class for Speech Granite. """ + from collections.abc import Sequence from typing import List, Union import numpy as np import torch + from transformers.feature_extraction_utils import BatchFeature from transformers.processing_utils import ProcessorMixin from transformers.tokenization_utils import PreTokenizedInput, TextInput @@ -30,7 +32,6 @@ class GraniteSpeechProcessor(ProcessorMixin): - attributes = ["feature_extractor", "tokenizer"] valid_kwargs = ["audio_token"] @@ -53,36 +54,34 @@ def __call__( device: str = "cpu", **kwargs, ) -> BatchFeature: - speech_inputs = {} text_inputs = {} text = self._get_validated_text(text) expected_num_audios = sum(t.count(self.audio_token) for t in text) - + if audios is not None: audios, audio_lengths = self._get_validated_audios(audios) if any(text.count(self.audio_token) != 1 for text in text): raise ValueError("Only one audio sample is currently supported per input") if len(audio_lengths) != expected_num_audios: raise ValueError("Text/Audio mismatch. 
The number of audios and audio tokens do not match") - + # Calculate Mel features & the number of placeholders we will need speech_inputs["input_features"] = self.feature_extractor( audios, device=device, ) - num_audio_features = self.feature_extractor._get_num_audio_features( - audio_lengths - ) - speech_inputs["input_features_mask"] = torch.arange(max(num_audio_features)).view(1, -1) <= \ - torch.tensor(num_audio_features).view(-1, 1) - + num_audio_features = self.feature_extractor._get_num_audio_features(audio_lengths) + speech_inputs["input_features_mask"] = torch.arange(max(num_audio_features)).view(1, -1) <= torch.tensor( + num_audio_features + ).view(-1, 1) + # duplicate the audio placeholders to match the feature dims text = self._expand_audio_placeholders(text, num_audio_features) else: assert expected_num_audios == 0, "No audio is provided, expecting no audio tokens" - + text_inputs = self.tokenizer(text, padding=True, **kwargs) return BatchFeature(data={**text_inputs, **speech_inputs}) @@ -103,7 +102,7 @@ def _expand_audio_placeholders(self, text: list[str], num_audio_features: List[i ) num_replaced += 1 prompt_strings.append(sample) - + prompt_strings = [sample.replace("", self.audio_token) for sample in prompt_strings] return prompt_strings @@ -128,12 +127,12 @@ def _get_validated_audios(self, audios): audios = audios.unsqueeze(0) if not torch.is_floating_point(audios): raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1") - + if audios.shape[0] > 1: logger.warning("Audio samples are already collated; assuming they all have the same length") lengths = [audios.shape[-1]] * audios.shape[0] return audios, lengths - + elif isinstance(audios, Sequence) and isinstance(audios[0], torch.Tensor): if not torch.is_floating_point(audios[0]): raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1") @@ -144,8 +143,8 @@ def _get_validated_audios(self, audios): padded = [torch.nn.functional.pad(audio, (0, pad)) for audio, pad in zip(audios, padding)] audios = torch.cat(padded, dim=0) return audios, lengths - + raise TypeError("Invalid audio provided. Audio should be a one or more torch tensors or numpy arrays") -__all__ = ["GraniteSpeechProcessor"] \ No newline at end of file +__all__ = ["GraniteSpeechProcessor"] diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 65c700bbbd12..69ea0e878bda 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -13,9 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Testing suite for the IBM Granite Speech model.""" -import unittest -from ...generation.test_utils import GenerationTesterMixin + import tempfile +import unittest + from transformers import ( GraniteSpeechConfig, GraniteSpeechForConditionalGeneration, @@ -23,22 +24,24 @@ ) from transformers.testing_utils import ( require_torch, - torch_device, require_torch_sdpa, + torch_device, ) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, + _config_zero_init, floats_tensor, ids_tensor, - _config_zero_init, - torch_device, ) -from ...test_configuration_common import ConfigTester if is_torch_available(): import torch + class GraniteSpeechForConditionalGenerationModelTester: def __init__( self, @@ -56,7 +59,7 @@ def __init__( "input_dim": 160, "num_heads": 4, "num_layers": 2, - "output_dim": 42 + "output_dim": 42, }, text_config={ "model_type": "granite", @@ -99,7 +102,7 @@ def __init__( "position_embedding_type": "absolute", "use_qformer_text_input": False, "vocab_size": 30522, - "window_size": 15 + "window_size": 15, }, audio_token_index=0, tie_word_embeddings=True, @@ -117,7 +120,10 @@ def __init__( self.initializer_range = initializer_range self.has_lora_adapater = has_lora_adapter self.is_training = is_training - + + # Dims for audio features + self.sequence_dim = 844 + self.feaure_dim = 160 self.num_attention_heads = text_config["num_attention_heads"] self.num_hidden_layers = text_config["num_hidden_layers"] self.hidden_size = text_config["hidden_size"] @@ -127,7 +133,6 @@ def __init__( self.num_audio_tokens = 2 self.seq_length = seq_length + self.num_audio_tokens - def get_config(self): return GraniteSpeechConfig( encoder_config=self.encoder_config, @@ -139,16 +144,13 @@ def get_config(self): has_lora_adapter=self.has_lora_adapater, ) - def prepare_config_and_inputs(self): input_features = floats_tensor( - # TODO - clean this up - [self.batch_size, 844, 160], + [self.batch_size, self.sequence_dim, self.feature_dim], ) config = self.get_config() return config, input_features - def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, input_features = config_and_inputs @@ -165,9 +167,7 @@ def prepare_config_and_inputs_for_common(self): } return config, inputs_dict - def create_and_check_granite_speech_model_fp16_forward( - self, config, input_ids, input_features, attention_mask - ): + def create_and_check_granite_speech_model_fp16_forward(self, config, input_ids, input_features, attention_mask): model = GraniteSpeechForConditionalGeneration(config=config) model.to(torch_device) model.half() @@ -181,7 +181,11 @@ def create_and_check_granite_speech_model_fp16_forward( self.parent.assertFalse(torch.isnan(logits).any().item()) def create_and_check_granite_speech_model_fp16_autocast_forward( - self, config, input_ids, input_features, attention_mask, + self, + config, + input_ids, + input_features, + attention_mask, ): config.torch_dtype = torch.float16 model = GraniteSpeechForConditionalGeneration(config=config) @@ -197,7 +201,6 @@ def create_and_check_granite_speech_model_fp16_autocast_forward( self.parent.assertFalse(torch.isnan(logits).any().item()) - @require_torch class GraniteSpeechForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): """ @@ -254,7 +257,6 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) - 
@require_torch_sdpa def test_sdpa_can_dispatch_composite_models(self): # overwrite because Granite Speech is audio+text model (not vision+text) diff --git a/tests/models/granite_speech/test_processor_granite_speech.py b/tests/models/granite_speech/test_processor_granite_speech.py index 29e44c3d3d01..eb16883484cc 100644 --- a/tests/models/granite_speech/test_processor_granite_speech.py +++ b/tests/models/granite_speech/test_processor_granite_speech.py @@ -11,28 +11,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import pytest +import shutil import tempfile import unittest -import shutil -from parameterized import parameterized import numpy as np +import pytest import torch -from transformers import AutoTokenizer, GPT2TokenizerFast +from parameterized import parameterized +from transformers import AutoTokenizer, GPT2TokenizerFast from transformers.testing_utils import ( require_torch, - require_torchaudio, require_torch_gpu, + require_torchaudio, ) from transformers.utils import is_torchaudio_available if is_torchaudio_available(): - from transformers import GraniteSpeechProcessor, GraniteSpeechFeatureExtractor + from transformers import GraniteSpeechFeatureExtractor, GraniteSpeechProcessor pytest.skip("Public models not yet available", allow_module_level=True) + + @require_torch @require_torchaudio class GraniteSpeechProcessorTest(unittest.TestCase): @@ -127,10 +129,12 @@ def test_nested_bad_audios_fails(self): with pytest.raises(TypeError): processor(text=None, audios=["foo"]) - @parameterized.expand([ - ([1, 269920], [171], torch.rand), - ([1, 269920], [171], np.random.rand), - ]) + @parameterized.expand( + [ + ([1, 269920], [171], torch.rand), + ([1, 269920], [171], np.random.rand), + ] + ) def test_audio_token_filling_same_len_feature_tensors(self, vec_dims, num_expected_features, random_func): """Ensure audio token filling is handled correctly when we have one or more audio inputs whose features are all the same length @@ -144,14 +148,10 @@ def test_audio_token_filling_same_len_feature_tensors(self, vec_dims, num_expect tokenizer=tokenizer, feature_extractor=feature_extractor, ) - audios = random_func(*vec_dims) - .5 + audios = random_func(*vec_dims) - 0.5 audio_tokens = processor.audio_token * vec_dims[0] - inputs = processor( - text=f"{audio_tokens} Can you compare this audio?", - audios=audios, - return_tensors="pt" - ) + inputs = processor(text=f"{audio_tokens} Can you compare this audio?", audios=audios, return_tensors="pt") # Check the number of audio tokens audio_token_id = tokenizer.get_vocab()[processor.audio_token] @@ -176,7 +176,7 @@ def test_audio_token_filling_varying_len_feature_list(self): ) vec_dims = [[1, 142100], [1, 269920]] num_expected_features = [90, 171] - audios = [torch.rand(dims) - .5 for dims in vec_dims] + audios = [torch.rand(dims) - 0.5 for dims in vec_dims] inputs = processor( text=[ @@ -184,7 +184,7 @@ def test_audio_token_filling_varying_len_feature_list(self): f"{processor.audio_token} How does it compare with this audio?", ], audios=audios, - return_tensors="pt" + return_tensors="pt", ) # Check the number of audio tokens @@ -211,7 +211,7 @@ def test_device_override(self): ) vec_dims = [1, 269920] - wav = torch.rand(vec_dims) - .5 + wav = torch.rand(vec_dims) - 0.5 inputs = processor( text=f"{processor.audio_token} Can you transcribe this audio?", diff --git a/utils/check_repo.py b/utils/check_repo.py index 
a34a6ca4e6cc..55a63112c482 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -110,7 +110,7 @@ "FastSpeech2ConformerHifiGan", # Already tested by SpeechT5HifiGan (# Copied from) "FastSpeech2ConformerWithHifiGan", # Built with two smaller (tested) models. "GraphormerDecoderHead", # Building part of bigger (tested) model. - "GraniteSpeechQFormerModel", # Building part of bigger (tested) model. + "GraniteSpeechQFormerModel", # Building part of bigger (tested) model. "JukeboxVQVAE", # Building part of bigger (tested) model. "JukeboxPrior", # Building part of bigger (tested) model. "DecisionTransformerGPT2Model", # Building part of bigger (tested) model. From b0fe344ea1bafa7027312c7edc916ef0077d75de Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 19:22:06 +0000 Subject: [PATCH 063/116] Rerun torch dummy obj regen --- src/transformers/utils/dummy_torchaudio_objects.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/utils/dummy_torchaudio_objects.py b/src/transformers/utils/dummy_torchaudio_objects.py index 73cec412119e..f7aea4a70207 100644 --- a/src/transformers/utils/dummy_torchaudio_objects.py +++ b/src/transformers/utils/dummy_torchaudio_objects.py @@ -2,28 +2,28 @@ from ..utils import DummyObject, requires_backends -class MusicgenMelodyFeatureExtractor(metaclass=DummyObject): +class GraniteSpeechFeatureExtractor(metaclass=DummyObject): _backends = ["torchaudio"] def __init__(self, *args, **kwargs): requires_backends(self, ["torchaudio"]) -class MusicgenMelodyProcessor(metaclass=DummyObject): +class GraniteSpeechProcessor(metaclass=DummyObject): _backends = ["torchaudio"] def __init__(self, *args, **kwargs): requires_backends(self, ["torchaudio"]) -class GraniteSpeechFeatureExtractor(metaclass=DummyObject): +class MusicgenMelodyFeatureExtractor(metaclass=DummyObject): _backends = ["torchaudio"] def __init__(self, *args, **kwargs): requires_backends(self, ["torchaudio"]) -class GraniteSpeechProcessor(metaclass=DummyObject): +class MusicgenMelodyProcessor(metaclass=DummyObject): _backends = ["torchaudio"] def __init__(self, *args, **kwargs): From 42b940c54296619801013f744c10f30705bb36fd Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 19:42:24 +0000 Subject: [PATCH 064/116] Fix save pretrained for granite speech --- .../granite_speech/configuration_granite_speech.py | 6 ++---- .../models/granite_speech/modeling_granite_speech.py | 12 ++++++++++++ .../granite_speech/test_modeling_granite_speech.py | 2 +- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 1fc6a7e8c1e1..960bf13ab6a6 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -38,7 +38,7 @@ def __init__( ## adapted from transformers.models.blip.configuration_blip_2.Blip2VisionConfig class GraniteSpeechProjectorConfig(PretrainedConfig): - model_type = "blip_2_qformer" + model_type = "granite_speech_qformer" def __init__( self, @@ -107,9 +107,7 @@ def __init__( text_config = CONFIG_MAPPING["granite"]() if isinstance(projector_config, dict): - # TODO - Make this generic after blip2qformer is moved out to its own model dir. 
- if projector_config["model_type"] != "blip_2_qformer": - raise ValueError("Granite speech currently requires blip2 qformer as its encoder!") + # TODO - In the future, we should make this generic. projector_config = GraniteSpeechProjectorConfig(**projector_config) elif projector_config is None: projector_config = GraniteSpeechProjectorConfig() diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index f60940d4e9cd..62666d5ce352 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -1377,6 +1377,18 @@ def generate(self, *args, **kwargs): self.disable_adapters() return super().generate(*args, input_features=input_features, **kwargs) + def save_pretrained(self, *args, **kwargs): + # overwrite save_pretrained to first save the adapter if we have one + # NOTE - this will use the base model path we are exporting in the lora + # adapter, which may not necessarily be the best behavior, but for now + # we keep this for portability, since using the local dir causes problems + # if the model is loaded from outside of the current working dir. + if is_peft_available and self._hf_peft_config_loaded: + super().save_pretrained(*args, **kwargs) + # Then save the base model afterwards + self._hf_peft_config_loaded = False + super().save_pretrained(*args, **kwargs) + __all__ = [ "GraniteSpeechForConditionalGeneration", diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 69ea0e878bda..380014e8926c 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -96,7 +96,7 @@ def __init__( "layer_norm_eps": 1e-12, "llm_dim": 32, "max_position_embeddings": 2048, - "model_type": "blip_2_qformer", + "model_type": "granite_speech_qformer", "num_attention_heads": 4, "num_hidden_layers": 2, "position_embedding_type": "absolute", From 02fc57ad170be9c2e71faf0c6755c16f8ffe88f9 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 24 Mar 2025 19:46:53 +0000 Subject: [PATCH 065/116] Import sorting --- src/transformers/__init__.py | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2d054920196c..d4709337c9ec 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -530,6 +530,51 @@ _import_structure["tf_utils"] = [] +<<<<<<< HEAD +======= +try: + if not ( + is_librosa_available() + and is_essentia_available() + and is_scipy_available() + and is_torch_available() + and is_pretty_midi_available() + ): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import ( + dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects, + ) + + _import_structure["utils.dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects"] = [ + name + for name in dir(dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects) + if not name.startswith("_") + ] +else: + _import_structure["models.pop2piano"].append("Pop2PianoFeatureExtractor") + _import_structure["models.pop2piano"].append("Pop2PianoTokenizer") + _import_structure["models.pop2piano"].append("Pop2PianoProcessor") + +try: + if not is_torchaudio_available(): + raise OptionalDependencyNotAvailable() +except 
OptionalDependencyNotAvailable: + from .utils import ( + dummy_torchaudio_objects, + ) + + _import_structure["utils.dummy_torchaudio_objects"] = [ + name for name in dir(dummy_torchaudio_objects) if not name.startswith("_") + ] +else: + _import_structure["models.granite_speech"].append("GraniteSpeechFeatureExtractor") + _import_structure["models.granite_speech"].append("GraniteSpeechProcessor") + + _import_structure["models.musicgen_melody"].append("MusicgenMelodyFeatureExtractor") + _import_structure["models.musicgen_melody"].append("MusicgenMelodyProcessor") + +>>>>>>> 1a8b4a742 (Import sorting) # FLAX-backed objects try: if not is_flax_available(): From f7e53ed98cdfa97815f4b7e62062559631e844a3 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Tue, 25 Mar 2025 16:27:10 +0000 Subject: [PATCH 066/116] Fix tests typo --- tests/models/granite_speech/test_modeling_granite_speech.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 380014e8926c..412e3114c9b5 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -123,7 +123,7 @@ def __init__( # Dims for audio features self.sequence_dim = 844 - self.feaure_dim = 160 + self.feature_dim = 160 self.num_attention_heads = text_config["num_attention_heads"] self.num_hidden_layers = text_config["num_hidden_layers"] self.hidden_size = text_config["hidden_size"] From 7853cc785aa3b81f81bc3a20db680d926685d366 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Tue, 25 Mar 2025 16:35:09 +0000 Subject: [PATCH 067/116] Remove offset hack --- .../models/granite_speech/configuration_granite_speech.py | 2 -- .../models/granite_speech/modeling_granite_speech.py | 7 +------ 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 960bf13ab6a6..9aa1d2aad65d 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -18,7 +18,6 @@ def __init__( dropout=0.1, conv_kernel_size=15, conv_expansion_factor=2, - use_max_pos_emb_in_pos_emb_calc=True, **kwargs, ): super().__init__(**kwargs) @@ -33,7 +32,6 @@ def __init__( self.dropout = dropout self.conv_kernel_size = conv_kernel_size self.conv_expansion_factor = conv_expansion_factor - self.use_max_pos_emb_in_pos_emb_calc = use_max_pos_emb_in_pos_emb_calc ## adapted from transformers.models.blip.configuration_blip_2.Blip2VisionConfig diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 62666d5ce352..297d60733eb5 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -800,7 +800,6 @@ def __init__(self, config: GraniteSpeechEncoderConfig): attn_dropout=config.dropout, ff_dropout=config.dropout, conv_dropout=config.dropout, - use_max_pos_emb_in_pos_emb_calc=config.use_max_pos_emb_in_pos_emb_calc, ) for layer_idx in range(config.num_layers) ] @@ -888,7 +887,6 @@ def __init__( dropout=0.0, context_size=200, max_pos_emb=512, - use_max_pos_emb_in_pos_emb_calc=True, ): super().__init__() inner_dim = dim_head * heads @@ -903,7 +901,6 @@ def __init__( 
self.rel_pos_emb = nn.Embedding(2 * max_pos_emb + 1, dim_head) self.dropout = nn.Dropout(dropout) - self.offset = max_pos_emb if use_max_pos_emb_in_pos_emb_calc else context_size def forward(self, x, context_size): device, h, max_pos_emb = x.device, self.heads, self.max_pos_emb @@ -924,7 +921,7 @@ def forward(self, x, context_size): # shaw's relative positional embedding seq = torch.arange(context_size, device=device) dist = seq.view(-1, 1) - seq.view(1, -1) - dist = torch.clamp(dist, -context_size, context_size) + self.offset + dist = torch.clamp(dist, -context_size, context_size) + max_pos_emb rel_pos_emb = self.rel_pos_emb(dist).to(q) pos_attn = einsum("b m h c d, c r d -> b m h c r", q, rel_pos_emb) * self.scale dots = dots + pos_attn @@ -998,7 +995,6 @@ def __init__( attn_dropout=0.0, ff_dropout=0.0, conv_dropout=0.0, - use_max_pos_emb_in_pos_emb_calc=True, ): super().__init__() self.ff1 = GraniteSpeechConformerFeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) @@ -1008,7 +1004,6 @@ def __init__( heads=heads, dropout=attn_dropout, context_size=context_size, - use_max_pos_emb_in_pos_emb_calc=use_max_pos_emb_in_pos_emb_calc, ) self.conv = GraniteSpeechConformerConvModule( dim=dim, From 19b41bbe5edf423e28517a23c1bca23281d6e509 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Wed, 26 Mar 2025 10:11:51 +0000 Subject: [PATCH 068/116] Pass args through encoder config --- .../granite_speech/modeling_granite_speech.py | 114 ++++++------------ 1 file changed, 39 insertions(+), 75 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 297d60733eb5..a3d1cfdff944 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -790,16 +790,7 @@ def __init__(self, config: GraniteSpeechEncoderConfig): [nn.Linear(config.input_dim, config.hidden_dim, bias=True)] + [ GraniteSpeechConformerBlock( - dim=config.hidden_dim, - dim_head=config.dim_head, - heads=config.num_heads, - ff_mult=config.feedforward_mult, - conv_expansion_factor=config.conv_expansion_factor, - conv_kernel_size=config.conv_kernel_size, - context_size=config.context_size, # attention context size - attn_dropout=config.dropout, - ff_dropout=config.dropout, - conv_dropout=config.dropout, + config, ) for layer_idx in range(config.num_layers) ] @@ -879,31 +870,23 @@ def forward(self, x, context_size, **kwargs): class GraniteSpeechConformerAttention(nn.Module): - def __init__( - self, - dim, - heads=8, - dim_head=64, - dropout=0.0, - context_size=200, - max_pos_emb=512, - ): + def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() - inner_dim = dim_head * heads - self.heads = heads - self.dim_head = dim_head - self.scale = dim_head**-0.5 - self.to_q = nn.Linear(dim, inner_dim, bias=False) - self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) - self.to_out = nn.Linear(inner_dim, dim) + self.num_heads = config.num_heads + inner_dim = config.dim_head * self.num_heads + self.dim_head = config.dim_head + self.scale = self.dim_head**-0.5 + self.to_q = nn.Linear(config.hidden_dim, inner_dim, bias=False) + self.to_kv = nn.Linear(config.hidden_dim, inner_dim * 2, bias=False) + self.to_out = nn.Linear(inner_dim, config.hidden_dim) - self.max_pos_emb = max_pos_emb - self.rel_pos_emb = nn.Embedding(2 * max_pos_emb + 1, dim_head) + self.max_pos_emb = 512 + self.rel_pos_emb = nn.Embedding(2 * self.max_pos_emb + 1, self.dim_head) - 
self.dropout = nn.Dropout(dropout) + self.dropout = nn.Dropout(config.dropout) def forward(self, x, context_size): - device, h, max_pos_emb = x.device, self.heads, self.max_pos_emb + device, h, max_pos_emb = x.device, self.num_heads, self.max_pos_emb bs, n, d = x.shape assert context_size > 0 and context_size <= max_pos_emb @@ -942,10 +925,14 @@ def forward(self, x, context_size): class GraniteSpeechConformerFeedForward(nn.Module): - def __init__(self, dim, mult=4, dropout=0.0): + def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() self.net = nn.Sequential( - nn.Linear(dim, dim * mult), nn.SiLU(), nn.Dropout(dropout), nn.Linear(dim * mult, dim), nn.Dropout(dropout) + nn.Linear(config.hidden_dim, config.hidden_dim * config.feedforward_mult), + nn.SiLU(), + nn.Dropout(config.dropout), + nn.Linear(config.hidden_dim * config.feedforward_mult, config.hidden_dim), + nn.Dropout(config.dropout), ) def forward(self, x): @@ -953,23 +940,25 @@ def forward(self, x): class GraniteSpeechConformerConvModule(nn.Module): - def __init__(self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0): + def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() - - inner_dim = dim * expansion_factor - padding = self.calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0) + causal = False + inner_dim = config.hidden_dim * config.conv_expansion_factor + padding = self.calc_same_padding(config.conv_kernel_size) if not causal else (config.conv_kernel_size - 1, 0) self.net = nn.Sequential( - nn.LayerNorm(dim), + nn.LayerNorm(config.hidden_dim), GraniteSpeechConformerPermute(dims=(0, 2, 1)), - nn.Conv1d(dim, inner_dim * 2, 1), + nn.Conv1d(config.hidden_dim, inner_dim * 2, 1), nn.GLU(dim=1), - GraniteSpeechConformerDepthWiseConv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding), + GraniteSpeechConformerDepthWiseConv1d( + inner_dim, inner_dim, kernel_size=config.conv_kernel_size, padding=padding + ), nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(), nn.SiLU(), - nn.Conv1d(inner_dim, dim, 1), + nn.Conv1d(inner_dim, config.hidden_dim, 1), GraniteSpeechConformerPermute(dims=(0, 2, 1)), - nn.Dropout(dropout), + nn.Dropout(config.dropout), ) def forward(self, x): @@ -982,43 +971,18 @@ def calc_same_padding(kernel_size: int): class GraniteSpeechConformerBlock(nn.Module): - def __init__( - self, - *, - dim, - dim_head=64, - heads=8, - ff_mult=2, - conv_expansion_factor=2, - conv_kernel_size=31, - context_size=-1, - attn_dropout=0.0, - ff_dropout=0.0, - conv_dropout=0.0, - ): + def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() - self.ff1 = GraniteSpeechConformerFeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) - self.attn = GraniteSpeechConformerAttention( - dim=dim, - dim_head=dim_head, - heads=heads, - dropout=attn_dropout, - context_size=context_size, - ) - self.conv = GraniteSpeechConformerConvModule( - dim=dim, - causal=False, - expansion_factor=conv_expansion_factor, - kernel_size=conv_kernel_size, - dropout=conv_dropout, - ) - self.ff2 = GraniteSpeechConformerFeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) + self.ff1 = GraniteSpeechConformerFeedForward(config) + self.attn = GraniteSpeechConformerAttention(config) + self.conv = GraniteSpeechConformerConvModule(config) + self.ff2 = GraniteSpeechConformerFeedForward(config) - self.attn = GraniteSpeechConformerPreNormAttn(dim, self.attn) - self.ff1 = GraniteSpeechConformerScale(0.5, GraniteSpeechConformerPreNorm(dim, self.ff1)) - self.ff2 = 
GraniteSpeechConformerScale(0.5, GraniteSpeechConformerPreNorm(dim, self.ff2)) + self.attn = GraniteSpeechConformerPreNormAttn(config.hidden_dim, self.attn) + self.ff1 = GraniteSpeechConformerScale(0.5, GraniteSpeechConformerPreNorm(config.hidden_dim, self.ff1)) + self.ff2 = GraniteSpeechConformerScale(0.5, GraniteSpeechConformerPreNorm(config.hidden_dim, self.ff2)) - self.post_norm = nn.LayerNorm(dim) + self.post_norm = nn.LayerNorm(config.hidden_dim) def forward(self, x, context_size): x = self.ff1(x) + x From 7f22f15f8db50a6b8cff2bb525adf677a4c5d8ce Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Wed, 26 Mar 2025 10:31:06 +0000 Subject: [PATCH 069/116] Remove unused prune heads from blip2 --- .../models/granite_speech/modeling_granite_speech.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index a3d1cfdff944..bc47d95bfe70 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -568,15 +568,6 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value - # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerModel._prune_heads - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerModel.get_extended_attention_mask def get_extended_attention_mask( self, From 10d2ad9d47db79811cf6e314c854a3b193612580 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Wed, 26 Mar 2025 13:51:12 -0400 Subject: [PATCH 070/116] removing einsum. replaced with explicit multiplication (relative positional encodings) and sdpa attention. 
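A quick sanity check of the equivalence (toy shapes only, not the model's code): Shaw's relative-position scores come out the same whether they are computed with einsum or with a broadcasted multiply-and-sum, and F.scaled_dot_product_attention adds a floating-point attn_mask to q @ k^T * scale before the softmax, so passing the relative-position term as attn_mask reproduces the previous `dots + pos_attn` path. Shapes are collapsed to (heads, seq, dim_head) for brevity, and the `scale=` keyword assumes a recent PyTorch release.

    import torch
    import torch.nn.functional as F

    h, n, d, max_pos_emb = 2, 6, 8, 512
    scale = d ** -0.5
    q, k, v = (torch.randn(h, n, d) for _ in range(3))
    rel_pos_emb = torch.nn.Embedding(2 * max_pos_emb + 1, d)

    # Shaw's relative positional embedding for a toy sequence.
    seq = torch.arange(n)
    dist = torch.clamp(seq.view(-1, 1) - seq.view(1, -1), -n, n) + max_pos_emb
    rel = rel_pos_emb(dist)  # (n, n, d)

    # einsum vs. explicit broadcasted multiply + sum: same result.
    pos_attn_einsum = torch.einsum("h i d, i j d -> h i j", q, rel) * scale
    pos_attn = (q.unsqueeze(-2) * rel.view(1, n, n, d)).sum(dim=-1) * scale
    assert torch.allclose(pos_attn_einsum, pos_attn, atol=1e-5)

    # SDPA with a float attn_mask computes softmax(q @ k^T * scale + mask) @ v,
    # i.e. the old `dots + pos_attn` computation.
    out_sdpa = F.scaled_dot_product_attention(q, k, v, attn_mask=pos_attn, scale=scale)
    dots = torch.einsum("h i d, h j d -> h i j", q, k) * scale + pos_attn
    out_manual = torch.softmax(dots, dim=-1) @ v
    assert torch.allclose(out_sdpa, out_manual, atol=1e-5)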
--- .../granite_speech/modeling_granite_speech.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index bc47d95bfe70..1703cc2711ad 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -890,26 +890,23 @@ def forward(self, x, context_size): q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, dim=-1)) q, k, v = [t.reshape(bs, nb, context_size, h, -1).transpose(2, 3) for t in (q, k, v)] - dots = einsum("b m h i d, b m h j d -> b m h i j", q, k) * self.scale - # shaw's relative positional embedding seq = torch.arange(context_size, device=device) dist = seq.view(-1, 1) - seq.view(1, -1) dist = torch.clamp(dist, -context_size, context_size) + max_pos_emb rel_pos_emb = self.rel_pos_emb(dist).to(q) - pos_attn = einsum("b m h c d, c r d -> b m h c r", q, rel_pos_emb) * self.scale - dots = dots + pos_attn + rel_pos_emb_expanded = rel_pos_emb.view([1, 1, 1] + list(rel_pos_emb.shape)) + pos_attn = torch.sum(q.unsqueeze(-2) * rel_pos_emb_expanded, dim=-1) * self.scale if nr > 0: # masked attention in the extended block mask = torch.ones(context_size, context_size, dtype=bool, device=device) mask[:nr, :nr] = 0 - mask_value = -torch.finfo(dots.dtype).max - dots[:, -1, :].masked_fill_(mask, mask_value) - - attn = dots.softmax(dim=-1) + mask_value = -torch.finfo(pos_attn.dtype).max + pos_attn[:, -1, :].masked_fill_(mask, mask_value) - out = einsum("b m h i j, b m h j d -> b m h i d", attn, v) + with torch.nn.attention.sdpa_kernel(enable_flash=False, enable_math=True, enable_mem_efficient=False): + out = F.scaled_dot_product_attention(q, k, v, attn_mask=pos_attn, scale=self.scale) out = out.transpose(2, 3).reshape(bs, x.shape[1], -1) out = self.to_out(out[:, :n, :]) return self.dropout(out) From 85eaab5b7af8b9ee18ea1c61c6537bc27b451615 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Wed, 26 Mar 2025 14:13:40 -0400 Subject: [PATCH 071/116] remove Sequential from ConformerFeedForward and ConformerConvModule. 
+ fix for sdpa attention --- .../granite_speech/modeling_granite_speech.py | 65 +++++++++---------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 1703cc2711ad..07b37fcb885b 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -807,16 +807,6 @@ def forward(self, x: torch.Tensor): # NOTE: Conformer adapated from: https://github.com/lucidrains/conformer.git -class GraniteSpeechConformerPermute(nn.Module): - def __init__(self, dims): - super().__init__() - self.dims = dims - - def forward(self, x): - x = x.permute(self.dims) - return x - - class GraniteSpeechConformerDepthWiseConv1d(nn.Module): def __init__(self, chan_in, chan_out, kernel_size, padding): super().__init__() @@ -905,7 +895,7 @@ def forward(self, x, context_size): mask_value = -torch.finfo(pos_attn.dtype).max pos_attn[:, -1, :].masked_fill_(mask, mask_value) - with torch.nn.attention.sdpa_kernel(enable_flash=False, enable_math=True, enable_mem_efficient=False): + with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH): out = F.scaled_dot_product_attention(q, k, v, attn_mask=pos_attn, scale=self.scale) out = out.transpose(2, 3).reshape(bs, x.shape[1], -1) out = self.to_out(out[:, :n, :]) @@ -915,42 +905,45 @@ def forward(self, x, context_size): class GraniteSpeechConformerFeedForward(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() - self.net = nn.Sequential( - nn.Linear(config.hidden_dim, config.hidden_dim * config.feedforward_mult), - nn.SiLU(), - nn.Dropout(config.dropout), - nn.Linear(config.hidden_dim * config.feedforward_mult, config.hidden_dim), - nn.Dropout(config.dropout), - ) + self.up_proj = nn.Linear(config.hidden_dim, config.hidden_dim * config.feedforward_mult) + self.act_fn = nn.SiLU() + self.dropout = nn.Dropout(config.dropout) + self.down_proj = nn.Linear(config.hidden_dim * config.feedforward_mult, config.hidden_dim) def forward(self, x): - return self.net(x) + x = self.up_proj(x) + x = self.dropout(self.act_fn(x)) + x = self.down_proj(x) + x = self.dropout(x) + return x class GraniteSpeechConformerConvModule(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() - causal = False inner_dim = config.hidden_dim * config.conv_expansion_factor - padding = self.calc_same_padding(config.conv_kernel_size) if not causal else (config.conv_kernel_size - 1, 0) - - self.net = nn.Sequential( - nn.LayerNorm(config.hidden_dim), - GraniteSpeechConformerPermute(dims=(0, 2, 1)), - nn.Conv1d(config.hidden_dim, inner_dim * 2, 1), - nn.GLU(dim=1), - GraniteSpeechConformerDepthWiseConv1d( + padding = self.calc_same_padding(config.conv_kernel_size) + + self.norm = nn.LayerNorm(config.hidden_dim) + self.up_conv = nn.Conv1d(config.hidden_dim, inner_dim * 2, 1) + self.glu = nn.GLU(dim=1) + self.depth_conv = GraniteSpeechConformerDepthWiseConv1d( inner_dim, inner_dim, kernel_size=config.conv_kernel_size, padding=padding - ), - nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(), - nn.SiLU(), - nn.Conv1d(inner_dim, config.hidden_dim, 1), - GraniteSpeechConformerPermute(dims=(0, 2, 1)), - nn.Dropout(config.dropout), - ) + ) + self.silu = nn.SiLU() + self.batch_norm = nn.BatchNorm1d(inner_dim) + self.down_conv = nn.Conv1d(inner_dim, config.hidden_dim, 1) + self.dropout = nn.Dropout(config.dropout) 
def forward(self, x): - return self.net(x) + x = self.norm(x) + x = self.up_conv(x.permute(0, 2, 1)) + x = self.glu(x) + x = self.depth_conv(x) + x = self.silu(self.batch_norm(x)) + x = self.down_conv(x).permute(0, 2, 1) + x = self.dropout(x) + return x @staticmethod def calc_same_padding(kernel_size: int): From 10ca6eaa636d7b534a2b5602a5bc04b5bde330c8 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Thu, 27 Mar 2025 03:43:00 -0400 Subject: [PATCH 072/116] remove GraniteSpeechConformerScale --- .../granite_speech/modeling_granite_speech.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 07b37fcb885b..841c1eca3474 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -818,15 +818,6 @@ def forward(self, x): return self.conv(x) -class GraniteSpeechConformerScale(nn.Module): - def __init__(self, scale, fn): - super().__init__() - self.fn = fn - self.scale = scale - - def forward(self, x, **kwargs): - return self.fn(x, **kwargs) * self.scale - class GraniteSpeechConformerPreNorm(nn.Module): def __init__(self, dim, fn): @@ -960,16 +951,16 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.ff2 = GraniteSpeechConformerFeedForward(config) self.attn = GraniteSpeechConformerPreNormAttn(config.hidden_dim, self.attn) - self.ff1 = GraniteSpeechConformerScale(0.5, GraniteSpeechConformerPreNorm(config.hidden_dim, self.ff1)) - self.ff2 = GraniteSpeechConformerScale(0.5, GraniteSpeechConformerPreNorm(config.hidden_dim, self.ff2)) + self.ff1 = GraniteSpeechConformerPreNorm(config.hidden_dim, self.ff1) + self.ff2 = GraniteSpeechConformerPreNorm(config.hidden_dim, self.ff2) self.post_norm = nn.LayerNorm(config.hidden_dim) def forward(self, x, context_size): - x = self.ff1(x) + x + x = 0.5 * self.ff1(x) + x x = self.attn(x, context_size) + x x = self.conv(x) + x - x = self.ff2(x) + x + x = 0.5 * self.ff2(x) + x x = self.post_norm(x) return x From ca0c8ea8d6c379d28263804adb7a971686266ebe Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Thu, 27 Mar 2025 03:53:49 -0400 Subject: [PATCH 073/116] rename to hidden_states --- .../granite_speech/modeling_granite_speech.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 841c1eca3474..b5dd6a53191b 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -956,13 +956,13 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.post_norm = nn.LayerNorm(config.hidden_dim) - def forward(self, x, context_size): - x = 0.5 * self.ff1(x) + x - x = self.attn(x, context_size) + x - x = self.conv(x) + x - x = 0.5 * self.ff2(x) + x - x = self.post_norm(x) - return x + def forward(self, hidden_states, context_size): + hidden_states = 0.5 * self.ff1(hidden_states) + hidden_states + hidden_states = self.attn(hidden_states, context_size) + hidden_states + hidden_states = self.conv(hidden_states) + hidden_states + hidden_states = 0.5 * self.ff2(hidden_states) + hidden_states + hidden_states = self.post_norm(hidden_states) + return hidden_states GRANITE_SPEECH_START_DOCSTRING = r""" From 51f81ae5a55fbc97e128b343225187f0cbfa42c1 Mon 
Sep 17 00:00:00 2001 From: Avihu Dekel Date: Thu, 27 Mar 2025 03:54:49 -0400 Subject: [PATCH 074/116] rename conformer layers to self.layers, remove the first linear from the list to keep the list homogenous. --- .../granite_speech/modeling_granite_speech.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index b5dd6a53191b..7fbcadd4366f 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -776,15 +776,9 @@ def forward(self, x, atts): class GraniteSpeechCTCModel(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): super(GraniteSpeechCTCModel, self).__init__() - - self.rnn_tr = nn.ModuleList( - [nn.Linear(config.input_dim, config.hidden_dim, bias=True)] - + [ - GraniteSpeechConformerBlock( - config, - ) - for layer_idx in range(config.num_layers) - ] + self.input_linear = nn.Linear(config.input_dim, config.hidden_dim, bias=True) + self.layers = nn.ModuleList( + [GraniteSpeechConformerBlock(config) for _ in range(config.num_layers)] ) self.out = nn.Linear(config.hidden_dim, config.output_dim, bias=True) @@ -796,8 +790,8 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.output_dim = config.output_dim def forward(self, x: torch.Tensor): - x = self.rnn_tr[0](x) - for idx, layer in enumerate(self.rnn_tr[1:], start=1): + x = self.input_linear(x) + for idx, layer in enumerate(self.layers, start=1): x = layer(x, self.context_size) if idx == self.num_layers // 2: x_mid = x.clone() From 25441007e8072c8f8ceb22348de2eb03faeb364d Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Thu, 27 Mar 2025 04:51:22 -0400 Subject: [PATCH 075/116] move pre-norm to the attention/feedforward blocks (avoid complex module wrapping) --- .../granite_speech/modeling_granite_speech.py | 29 ++----------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 7fbcadd4366f..3c65de557389 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -813,28 +813,6 @@ def forward(self, x): -class GraniteSpeechConformerPreNorm(nn.Module): - def __init__(self, dim, fn): - super().__init__() - self.fn = fn - self.norm = nn.LayerNorm(dim) - - def forward(self, x, **kwargs): - x = self.norm(x) - return self.fn(x, **kwargs) - - -class GraniteSpeechConformerPreNormAttn(nn.Module): - def __init__(self, dim, fn): - super().__init__() - self.fn = fn - self.norm = nn.LayerNorm(dim) - - def forward(self, x, context_size, **kwargs): - x = self.norm(x) - return self.fn(x, context_size, **kwargs) - - class GraniteSpeechConformerAttention(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() @@ -842,6 +820,7 @@ def __init__(self, config: GraniteSpeechEncoderConfig): inner_dim = config.dim_head * self.num_heads self.dim_head = config.dim_head self.scale = self.dim_head**-0.5 + self.pre_norm = nn.LayerNorm(config.hidden_dim) self.to_q = nn.Linear(config.hidden_dim, inner_dim, bias=False) self.to_kv = nn.Linear(config.hidden_dim, inner_dim * 2, bias=False) self.to_out = nn.Linear(inner_dim, config.hidden_dim) @@ -890,6 +869,7 @@ def forward(self, x, context_size): class 
GraniteSpeechConformerFeedForward(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() + self.pre_norm = nn.LayerNorm(config.hidden_dim) self.up_proj = nn.Linear(config.hidden_dim, config.hidden_dim * config.feedforward_mult) self.act_fn = nn.SiLU() self.dropout = nn.Dropout(config.dropout) @@ -943,11 +923,6 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.attn = GraniteSpeechConformerAttention(config) self.conv = GraniteSpeechConformerConvModule(config) self.ff2 = GraniteSpeechConformerFeedForward(config) - - self.attn = GraniteSpeechConformerPreNormAttn(config.hidden_dim, self.attn) - self.ff1 = GraniteSpeechConformerPreNorm(config.hidden_dim, self.ff1) - self.ff2 = GraniteSpeechConformerPreNorm(config.hidden_dim, self.ff2) - self.post_norm = nn.LayerNorm(config.hidden_dim) def forward(self, hidden_states, context_size): From 40e2c68d02d3232846443949c6cace4cfe963b08 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Thu, 27 Mar 2025 06:04:28 -0400 Subject: [PATCH 076/116] adding pre_norm into forward --- .../models/granite_speech/modeling_granite_speech.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 3c65de557389..6274c9fbafd3 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -831,6 +831,7 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.dropout = nn.Dropout(config.dropout) def forward(self, x, context_size): + x = self.pre_norm(x) device, h, max_pos_emb = x.device, self.num_heads, self.max_pos_emb bs, n, d = x.shape assert context_size > 0 and context_size <= max_pos_emb @@ -876,6 +877,7 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.down_proj = nn.Linear(config.hidden_dim * config.feedforward_mult, config.hidden_dim) def forward(self, x): + x = self.pre_norm(x) x = self.up_proj(x) x = self.dropout(self.act_fn(x)) x = self.down_proj(x) From d529e29a68fc6fd21bdf78d5cc8b0339db0da602 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Thu, 27 Mar 2025 09:47:23 -0400 Subject: [PATCH 077/116] feature extractor refactoring to resemble how it's done in phi4multimodal. 
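After this refactor the processor no longer builds mel features itself: it calls the audio processor, which validates and collates the raw audio and returns `input_features`, `audio_embed_sizes`, and `input_features_mask`, while the processor only expands the `<|audio|>` placeholders to match `audio_embed_sizes`. The snippet below is a rough, standalone sketch of what the new `_get_validated_audios` helper does with a list of variable-length clips; the clip lengths are arbitrary.

    import torch

    # Two mono clips of different lengths, floats in [-0.5, 0.5).
    audios = [torch.rand(1, 8000) - 0.5, torch.rand(1, 12000) - 0.5]

    lengths = [audio.shape[-1] for audio in audios]
    padding = [max(lengths) - length for length in lengths]
    padded = [torch.nn.functional.pad(audio.view(1, -1), (0, pad)) for audio, pad in zip(audios, padding)]
    batched = torch.cat(padded, dim=0)

    print(batched.shape)  # torch.Size([2, 12000]) -> fed to the mel transform
    print(lengths)        # [8000, 12000] -> later turned into audio_embed_sizes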
--- .../feature_extraction_granite_speech.py | 66 +++++++++++++++++-- .../processing_granite_speech.py | 66 ++++--------------- 2 files changed, 71 insertions(+), 61 deletions(-) diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py index ebf145a8e39f..70de5808bfe9 100644 --- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -16,8 +16,10 @@ Feature extractor class for Speech Granite """ +from collections.abc import Sequence import math -from typing import List, Optional +import numpy as np +from typing import List, Optional, Union, Tuple from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin from transformers.utils import is_torch_available, is_torchaudio_available, logging @@ -31,6 +33,10 @@ if is_torchaudio_available(): import torchaudio +# todo: import from transformers.audio_utils +AudioInput = Union[ + np.ndarray, "torch.Tensor", List[np.ndarray], Tuple[np.ndarray], List["torch.Tensor"], Tuple["torch.Tensor"] # noqa: F821 +] class GraniteSpeechFeatureExtractor(FeatureExtractionMixin): model_input_names = ["input_features"] @@ -62,15 +68,31 @@ def __init__( self.projector_window_size = projector_window_size self.projector_downsample_rate = projector_downsample_rate - def _ensure_melspec_transform_is_initialized(self): - if self.melspec is None: - self.melspec = torchaudio.transforms.MelSpectrogram(**self.melspec_kwargs) - def __call__( self, - x: torch.Tensor, + audios: AudioInput, device: Optional[str] = "cpu", ) -> BatchFeature: + speech_inputs = {} + batched_audio, audio_lengths = self._get_validated_audios(audios) + # Calculate Mel features & the number of placeholders we will need + speech_inputs["input_features"] = self._extract_mel_spectrograms( + batched_audio, + device=device, + ) + audio_embed_sizes = self._get_num_audio_features(audio_lengths) + speech_inputs["audio_embed_sizes"] = audio_embed_sizes + # todo: input_features_mask is not a great name, because input_features and input_features mask have different shapes (before/after the projector) + speech_inputs["input_features_mask"] = torch.arange(max(audio_embed_sizes)).view(1, -1) <= torch.tensor( + audio_embed_sizes + ).view(-1, 1) + return BatchFeature(data=speech_inputs) + + def _ensure_melspec_transform_is_initialized(self): + if self.melspec is None: + self.melspec = torchaudio.transforms.MelSpectrogram(**self.melspec_kwargs) + + def _extract_mel_spectrograms(self, x: torch.Tensor, device="cpu"): # TODO there is probably a better way to do both of these things... self._ensure_melspec_transform_is_initialized() if device is not None: @@ -114,5 +136,37 @@ def _get_num_audio_features(self, audio_lengths: List[int]) -> List[int]: return projector_lengths + def _get_validated_audios(self, audios: AudioInput): + # Coerce to PyTorch tensors if we have numpy arrays, since + # currently we have a dependency on torch/torchaudio anyway + if isinstance(audios, np.ndarray): + audios = torch.from_numpy(audios) + elif isinstance(audios, Sequence) and isinstance(audios[0], np.ndarray): + audios = [torch.from_numpy(arr) for arr in audios] + + if isinstance(audios, torch.Tensor): + if audios.ndim == 1: + audios = audios.unsqueeze(0) + if not torch.is_floating_point(audios): + raise ValueError("Invalid audio provided. 
Audio should be a floating point between 0 and 1") + + if audios.shape[0] > 1: + logger.warning("Audio samples are already collated; assuming they all have the same length") + lengths = [audios.shape[-1]] * audios.shape[0] + return audios, lengths + + elif isinstance(audios, Sequence) and isinstance(audios[0], torch.Tensor): + if not torch.is_floating_point(audios[0]): + raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1") + lengths = [audio.shape[-1] for audio in audios] + padding = [max(lengths) - length for length in lengths] + # ensure all audios have a batch dimension: + audios = [audio.view(1, -1) for audio in audios] + padded = [torch.nn.functional.pad(audio, (0, pad)) for audio, pad in zip(audios, padding)] + audios = torch.cat(padded, dim=0) + return audios, lengths + + raise TypeError("Invalid audio provided. Audio should be a one or more torch tensors or numpy arrays") + __all__ = ["GraniteSpeechFeatureExtractor"] diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 1d4c0cfbfe62..d24927d925b5 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -16,10 +16,8 @@ Processor class for Speech Granite. """ -from collections.abc import Sequence from typing import List, Union -import numpy as np import torch from transformers.feature_extraction_utils import BatchFeature @@ -32,20 +30,20 @@ class GraniteSpeechProcessor(ProcessorMixin): - attributes = ["feature_extractor", "tokenizer"] + attributes = ["audio_processor", "tokenizer"] valid_kwargs = ["audio_token"] - feature_extractor_class = "GraniteSpeechFeatureExtractor" + audio_processor_class = "GraniteSpeechFeatureExtractor" tokenizer_class = "AutoTokenizer" def __init__( self, - feature_extractor, + audio_processor, tokenizer, audio_token="<|audio|>", ): self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token - super().__init__(feature_extractor, tokenizer) + super().__init__(audio_processor, tokenizer) def __call__( self, @@ -54,36 +52,25 @@ def __call__( device: str = "cpu", **kwargs, ) -> BatchFeature: - speech_inputs = {} - text_inputs = {} text = self._get_validated_text(text) expected_num_audios = sum(t.count(self.audio_token) for t in text) if audios is not None: - audios, audio_lengths = self._get_validated_audios(audios) + audio_inputs = self.audio_processor(audios, device=device) + audio_embed_sizes = audio_inputs.pop("audio_embed_sizes") if any(text.count(self.audio_token) != 1 for text in text): raise ValueError("Only one audio sample is currently supported per input") - if len(audio_lengths) != expected_num_audios: + if len(audio_embed_sizes) != expected_num_audios: raise ValueError("Text/Audio mismatch. 
The number of audios and audio tokens do not match") - - # Calculate Mel features & the number of placeholders we will need - speech_inputs["input_features"] = self.feature_extractor( - audios, - device=device, - ) - num_audio_features = self.feature_extractor._get_num_audio_features(audio_lengths) - speech_inputs["input_features_mask"] = torch.arange(max(num_audio_features)).view(1, -1) <= torch.tensor( - num_audio_features - ).view(-1, 1) - # duplicate the audio placeholders to match the feature dims - text = self._expand_audio_placeholders(text, num_audio_features) + processed_text = self._expand_audio_placeholders(text, audio_embed_sizes) else: + audio_inputs = {} assert expected_num_audios == 0, "No audio is provided, expecting no audio tokens" - text_inputs = self.tokenizer(text, padding=True, **kwargs) - return BatchFeature(data={**text_inputs, **speech_inputs}) + text_inputs = self.tokenizer(processed_text, padding=True, **kwargs) + return BatchFeature(data={**text_inputs, **audio_inputs}) def _expand_audio_placeholders(self, text: list[str], num_audio_features: List[int]): """ @@ -114,37 +101,6 @@ def _get_validated_text(self, text: Union[str, list]) -> List[str]: return text raise TypeError("Invalid text provided! Text should be a string or list of strings.") - def _get_validated_audios(self, audios): - # Coerce to PyTorch tensors if we have numpy arrays, since - # currently we have a dependency on torch/torchaudio anyway - if isinstance(audios, np.ndarray): - audios = torch.from_numpy(audios) - elif isinstance(audios, Sequence) and isinstance(audios[0], np.ndarray): - audios = [torch.from_numpy(arr) for arr in audios] - - if isinstance(audios, torch.Tensor): - if audios.ndim == 1: - audios = audios.unsqueeze(0) - if not torch.is_floating_point(audios): - raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1") - - if audios.shape[0] > 1: - logger.warning("Audio samples are already collated; assuming they all have the same length") - lengths = [audios.shape[-1]] * audios.shape[0] - return audios, lengths - - elif isinstance(audios, Sequence) and isinstance(audios[0], torch.Tensor): - if not torch.is_floating_point(audios[0]): - raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1") - lengths = [audio.shape[-1] for audio in audios] - padding = [max(lengths) - length for length in lengths] - # ensure all audios have a batch dimension: - audios = [audio.view(1, -1) for audio in audios] - padded = [torch.nn.functional.pad(audio, (0, pad)) for audio, pad in zip(audios, padding)] - audios = torch.cat(padded, dim=0) - return audios, lengths - - raise TypeError("Invalid audio provided. 
Audio should be a one or more torch tensors or numpy arrays") __all__ = ["GraniteSpeechProcessor"] From e628517132c245b0df7dac879dfbcd58765a64bc Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Thu, 27 Mar 2025 11:35:41 -0400 Subject: [PATCH 078/116] rename feature_extractor to audio_processor --- .../test_processor_granite_speech.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/models/granite_speech/test_processor_granite_speech.py b/tests/models/granite_speech/test_processor_granite_speech.py index eb16883484cc..141870aec7c0 100644 --- a/tests/models/granite_speech/test_processor_granite_speech.py +++ b/tests/models/granite_speech/test_processor_granite_speech.py @@ -48,7 +48,7 @@ def setUp(self): def get_tokenizer(self, **kwargs): return AutoTokenizer.from_pretrained(self.checkpoint, **kwargs) - def get_feature_extractor(self, **kwargs): + def get_audio_processor(self, **kwargs): return GraniteSpeechFeatureExtractor.from_pretrained(self.checkpoint, **kwargs) def tearDown(self): @@ -57,10 +57,10 @@ def tearDown(self): def test_save_load_pretrained_default(self): """Ensure we can save / reload a processor correctly.""" tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() + audio_processor = self.get_audio_processor() processor = GraniteSpeechProcessor( tokenizer=tokenizer, - feature_extractor=feature_extractor, + audio_processor=audio_processor, ) processor.save_pretrained(self.tmpdirname) @@ -69,16 +69,16 @@ def test_save_load_pretrained_default(self): self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) self.assertIsInstance(processor.tokenizer, GPT2TokenizerFast) - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) - self.assertIsInstance(processor.feature_extractor, GraniteSpeechFeatureExtractor) + self.assertEqual(processor.audio_processor.to_json_string(), audio_processor.to_json_string()) + self.assertIsInstance(processor.audio_processor, GraniteSpeechFeatureExtractor) def test_requires_text(self): """Ensure we require text""" tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() + audio_processor = self.get_audio_processor() processor = GraniteSpeechProcessor( tokenizer=tokenizer, - feature_extractor=feature_extractor, + audio_processor=audio_processor, ) with pytest.raises(TypeError): @@ -87,19 +87,19 @@ def test_requires_text(self): def test_bad_text_fails(self): """Ensure we gracefully fail if text is the wrong type.""" tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() + audio_processor = self.get_audio_processor() - processor = GraniteSpeechProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + processor = GraniteSpeechProcessor(tokenizer=tokenizer, audio_processor=audio_processor) with pytest.raises(TypeError): processor(text=424, audios=None) def test_bad_nested_text_fails(self): """Ensure we gracefully fail if text is the wrong nested type.""" tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() + audio_processor = self.get_audio_processor() processor = GraniteSpeechProcessor( tokenizer=tokenizer, - feature_extractor=feature_extractor, + audio_processor=audio_processor, ) with pytest.raises(TypeError): @@ -108,10 +108,10 @@ def test_bad_nested_text_fails(self): def test_bad_audios_fails(self): """Ensure we gracefully fail if audio is the wrong type.""" tokenizer = self.get_tokenizer() - feature_extractor = 
self.get_feature_extractor() + audio_processor = self.get_audio_processor() processor = GraniteSpeechProcessor( tokenizer=tokenizer, - feature_extractor=feature_extractor, + audio_processor=audio_processor, ) with pytest.raises(TypeError): @@ -120,10 +120,10 @@ def test_bad_audios_fails(self): def test_nested_bad_audios_fails(self): """Ensure we gracefully fail if audio is the wrong nested type.""" tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() + audio_processor = self.get_audio_processor() processor = GraniteSpeechProcessor( tokenizer=tokenizer, - feature_extractor=feature_extractor, + audio_processor=audio_processor, ) with pytest.raises(TypeError): @@ -143,10 +143,10 @@ def test_audio_token_filling_same_len_feature_tensors(self, vec_dims, num_expect NOTE: Currently we enforce that each sample can only have one audio. """ tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() + audio_processor = self.get_audio_processor() processor = GraniteSpeechProcessor( tokenizer=tokenizer, - feature_extractor=feature_extractor, + audio_processor=audio_processor, ) audios = random_func(*vec_dims) - 0.5 @@ -157,7 +157,7 @@ def test_audio_token_filling_same_len_feature_tensors(self, vec_dims, num_expect audio_token_id = tokenizer.get_vocab()[processor.audio_token] # Make sure the number of audio tokens matches the number of features - num_computed_features = processor.feature_extractor._get_num_audio_features( + num_computed_features = processor.audio_processor._get_num_audio_features( [vec_dims[1] for _ in range(vec_dims[0])], ) num_audio_tokens = int(torch.sum(inputs["input_ids"] == audio_token_id)) @@ -169,10 +169,10 @@ def test_audio_token_filling_varying_len_feature_list(self): multiple varying len audio sequences passed as a list. """ tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() + audio_processor = self.get_audio_processor() processor = GraniteSpeechProcessor( tokenizer=tokenizer, - feature_extractor=feature_extractor, + audio_processor=audio_processor, ) vec_dims = [[1, 142100], [1, 269920]] num_expected_features = [90, 171] @@ -191,7 +191,7 @@ def test_audio_token_filling_varying_len_feature_list(self): audio_token_id = tokenizer.get_vocab()[processor.audio_token] # Make sure the number of audio tokens matches the number of features - num_calculated_features = processor.feature_extractor._get_num_audio_features( + num_calculated_features = processor.audio_processor._get_num_audio_features( [dims[1] for dims in vec_dims], ) num_audio_tokens = int(torch.sum(inputs["input_ids"] == audio_token_id)) @@ -204,10 +204,10 @@ def test_device_override(self): produced are on the CPU. """ tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() + audio_processor = self.get_audio_processor() processor = GraniteSpeechProcessor( tokenizer=tokenizer, - feature_extractor=feature_extractor, + audio_processor=audio_processor, ) vec_dims = [1, 269920] From 9569d7661a7719a27154e86889d7c5f306562d49 Mon Sep 17 00:00:00 2001 From: Avihu Dekel Date: Sun, 30 Mar 2025 06:41:30 -0400 Subject: [PATCH 079/116] bugfix: input_feature_mask fix to get the exact number tokens. 
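Why the change from `<=` to `<` matters (made-up sizes, illustration only): `torch.arange(max(sizes))` already enumerates positions 0..max-1, so the exclusive comparison marks exactly `audio_embed_sizes[i]` positions per row, whereas the inclusive comparison marked one extra position for every clip shorter than the longest one in the batch.

    import torch

    audio_embed_sizes = [3, 5]  # real values come from _get_num_audio_features
    positions = torch.arange(max(audio_embed_sizes)).view(1, -1)  # [[0, 1, 2, 3, 4]]
    sizes = torch.tensor(audio_embed_sizes).view(-1, 1)

    old_mask = positions <= sizes  # one extra True for the shorter clip
    new_mask = positions < sizes   # exactly audio_embed_sizes[i] True entries per row

    print(old_mask.sum(dim=-1))  # tensor([4, 5])
    print(new_mask.sum(dim=-1))  # tensor([3, 5])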
--- .../models/granite_speech/feature_extraction_granite_speech.py | 2 +- .../models/granite_speech/modeling_granite_speech.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py index 70de5808bfe9..f7be53573e70 100644 --- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -83,7 +83,7 @@ def __call__( audio_embed_sizes = self._get_num_audio_features(audio_lengths) speech_inputs["audio_embed_sizes"] = audio_embed_sizes # todo: input_features_mask is not a great name, because input_features and input_features mask have different shapes (before/after the projector) - speech_inputs["input_features_mask"] = torch.arange(max(audio_embed_sizes)).view(1, -1) <= torch.tensor( + speech_inputs["input_features_mask"] = torch.arange(max(audio_embed_sizes)).view(1, -1) < torch.tensor( audio_embed_sizes ).view(-1, 1) return BatchFeature(data=speech_inputs) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 6274c9fbafd3..32d48ff2b355 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -1253,6 +1253,8 @@ def get_merged_audio_embeddings(self, input_ids, audio_features, input_features_ and potentially labels. """ is_audio_index = input_ids == self.config.audio_token_index + assert torch.all(is_audio_index.int().sum(dim=1) == input_features_mask.int().sum(dim=1)).item(), \ + "number of features should align" llm_input_ids = torch.where(is_audio_index, 0, input_ids) inputs_embeds = self.language_model.get_input_embeddings()(llm_input_ids) # [bsz, # features, hidden size] From f62be3b55f120b0b9246927052bf0d9990b7ea75 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 31 Mar 2025 07:32:38 +0000 Subject: [PATCH 080/116] Fix pytest decorator in processor test --- tests/models/granite_speech/test_processor_granite_speech.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/models/granite_speech/test_processor_granite_speech.py b/tests/models/granite_speech/test_processor_granite_speech.py index 141870aec7c0..6d0ab43b3514 100644 --- a/tests/models/granite_speech/test_processor_granite_speech.py +++ b/tests/models/granite_speech/test_processor_granite_speech.py @@ -32,9 +32,8 @@ if is_torchaudio_available(): from transformers import GraniteSpeechFeatureExtractor, GraniteSpeechProcessor -pytest.skip("Public models not yet available", allow_module_level=True) - +@pytest.skip("Public models not yet available", allow_module_level=True) @require_torch @require_torchaudio class GraniteSpeechProcessorTest(unittest.TestCase): From 6eeaab45ca194fa1182bf0c24e3fa41d5480b97a Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 31 Mar 2025 07:32:55 +0000 Subject: [PATCH 081/116] Add (disabled) integration tests for granite speech --- .../test_modeling_granite_speech.py | 99 ++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 412e3114c9b5..88fd1ced55a9 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ 
b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -17,16 +17,24 @@ import tempfile import unittest +import pytest + from transformers import ( + AutoProcessor, GraniteSpeechConfig, GraniteSpeechForConditionalGeneration, - is_torch_available, ) from transformers.testing_utils import ( + cleanup, require_torch, require_torch_sdpa, + slow, torch_device, ) +from transformers.utils import ( + is_datasets_available, + is_torch_available, +) from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -41,6 +49,9 @@ if is_torch_available(): import torch +if is_datasets_available(): + from datasets import load_dataset + class GraniteSpeechForConditionalGenerationModelTester: def __init__( @@ -294,3 +305,89 @@ def test_sdpa_can_dispatch_composite_models(self): class_name = submodule.__class__.__name__ if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: raise ValueError("The eager model should not have SDPA attention layers") + + +class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase): + def setUp(self): + # TODO - use the actual model path on HF hub after release. + self.model_path = "ibm-granite/granite-speech" + self.processor = AutoProcessor.from_pretrained(self.model_path) + self.prompt = self._get_prompt(self.processor.tokenizer) + + def tearDown(self): + cleanup(torch_device, gc_collect=True) + + def _get_prompt(self, tokenizer): + chat = [ + { + "role": "system", + "content": "Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant", + }, + { + "role": "user", + "content": "<|audio|>can you transcribe the speech into a written format?", + }, + ] + return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) + + def _load_datasamples(self, num_samples): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + @slow + @pytest.mark.skip("Public models not yet available") + def test_small_model_integration_test_single(self): + model = GraniteSpeechForConditionalGeneration.from_pretrained(self.model_path).to(torch_device) + input_speech = self._load_datasamples(1) + + # Verify feature sizes; note that the feature mask refers to the size of + # features that are masked into the LLM, not the output of the processor, + # which is why we inspect the mask instead of the `num_features` tensor. + inputs = self.processor(self.prompt, input_speech, return_tensors="pt").to(torch_device) + + num_computed_features = self.processor.audio_processor._get_num_audio_features( + [speech_arr.shape[-1] for speech_arr in input_speech], + )[0] + num_actual_features = torch.sum(inputs["input_features_mask"]).item() + assert num_actual_features == num_computed_features + + # verify generation + output = model.generate(**inputs, max_new_tokens=32) + EXPECTED_DECODED_TEXT = "systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. 
You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantmister quilter is the apostle of the middle classes and we are glad to welcome his gospel" # fmt: skip + + self.assertEqual( + self.processor.tokenizer.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @pytest.mark.skip("Public models not yet available") + def test_small_model_integration_test_batch(self): + model = GraniteSpeechForConditionalGeneration.from_pretrained(self.model_path) + input_speech = self._load_datasamples(2) + prompts = [self.prompt, self.prompt] + + # Verify feature sizes & padding + inputs = self.processor(prompts, input_speech, return_tensors="pt").to(model.device) + num_computed_features = self.processor.audio_processor._get_num_audio_features( + [speech_arr.shape[-1] for speech_arr in input_speech], + ) + num_actual_features = torch.sum(inputs["input_features_mask"], dim=-1) + for e_feats, a_feats in zip(num_computed_features, num_actual_features): + assert e_feats == a_feats.item() + + # verify generation + output = model.generate(**inputs, max_new_tokens=32) + + EXPECTED_DECODED_TEXT = [ + "systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantmister quilter is the apostle of the middle classes and we are glad to welcome his gospel", + "systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantnor is mr quilter's manner less interesting than his matter" + ] # fmt: skip + + self.assertEqual( + self.processor.tokenizer.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) From 5ad01a275b55b8e11dd9b17055a53d9037fc381a Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 3 Apr 2025 12:25:55 +0000 Subject: [PATCH 082/116] Fix handling of optional feature masking --- .../granite_speech/modeling_granite_speech.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 32d48ff2b355..bf53a8dc0fc5 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -1165,7 +1165,9 @@ def forward( # Merge the audio features into the LLM embeddings inputs_embeds = self.get_merged_audio_embeddings( - input_ids=input_ids, audio_features=audio_features, input_features_mask=input_features_mask + input_ids=input_ids, + audio_features=audio_features, + input_features_mask=input_features_mask, ) outputs = self.language_model( @@ -1248,19 +1250,19 @@ def get_merged_audio_embeddings(self, input_ids, audio_features, input_features_ Adds the audio token to the model's LLM vocabulary so that we can pass it through the tokenizer; it's assumed that the embeddings corresponding to the <|audio|> token will be clobbered with speech features. - - TODO - This needs to be adapted to handle batches of variable length sequences - and potentially labels. 
""" is_audio_index = input_ids == self.config.audio_token_index - assert torch.all(is_audio_index.int().sum(dim=1) == input_features_mask.int().sum(dim=1)).item(), \ - "number of features should align" llm_input_ids = torch.where(is_audio_index, 0, input_ids) inputs_embeds = self.language_model.get_input_embeddings()(llm_input_ids) # [bsz, # features, hidden size] # Mask the audio features into the text embeddings special_audio_mask = is_audio_index.unsqueeze(-1) - audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)[input_features_mask] + audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) + if input_features_mask is not None: + assert torch.all(is_audio_index.int().sum(dim=1) == input_features_mask.int().sum(dim=1)).item(), \ + "number of features should align" + audio_features = audio_features[input_features_mask] + inputs_embeds = inputs_embeds.masked_scatter( special_audio_mask, audio_features, From fae6307c0d4dc1e5a5f18d80a550b33f54114a62 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 3 Apr 2025 12:26:27 +0000 Subject: [PATCH 083/116] Loosen validation in processing for vLLM compatability --- .../feature_extraction_granite_speech.py | 14 ++++++-------- .../granite_speech/processing_granite_speech.py | 13 +++++-------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py index f7be53573e70..77204f54f6b9 100644 --- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Feature extractor class for Speech Granite +Feature extractor class for Granite Speech. """ from collections.abc import Sequence @@ -22,9 +22,9 @@ from typing import List, Optional, Union, Tuple from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from transformers.tokenization_utils_base import AudioInput from transformers.utils import is_torch_available, is_torchaudio_available, logging - logger = logging.get_logger(__name__) if is_torch_available(): @@ -33,10 +33,6 @@ if is_torchaudio_available(): import torchaudio -# todo: import from transformers.audio_utils -AudioInput = Union[ - np.ndarray, "torch.Tensor", List[np.ndarray], Tuple[np.ndarray], List["torch.Tensor"], Tuple["torch.Tensor"] # noqa: F821 -] class GraniteSpeechFeatureExtractor(FeatureExtractionMixin): model_input_names = ["input_features"] @@ -63,7 +59,7 @@ def __init__( # HACK - for now, lazily initialize the mel spectrogram transform; # the feature extractor mixin explodes otherwise because # it tries to log the feature extractor, and the melspectrogram - # transform isn't json serializable... + # transform isn't json serializable. 
self.melspec = None self.projector_window_size = projector_window_size self.projector_downsample_rate = projector_downsample_rate @@ -82,7 +78,9 @@ def __call__( ) audio_embed_sizes = self._get_num_audio_features(audio_lengths) speech_inputs["audio_embed_sizes"] = audio_embed_sizes - # todo: input_features_mask is not a great name, because input_features and input_features mask have different shapes (before/after the projector) + # TODO: input_features_mask is not a great name, because + # input_features and input_features_mask have different shapes + # (before/after the projector) speech_inputs["input_features_mask"] = torch.arange(max(audio_embed_sizes)).view(1, -1) < torch.tensor( audio_embed_sizes ).view(-1, 1) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index d24927d925b5..e5b020f6525f 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Processor class for Speech Granite. +Processor class for Granite Speech. """ from typing import List, Union @@ -54,20 +54,18 @@ def __call__( ) -> BatchFeature: text = self._get_validated_text(text) - expected_num_audios = sum(t.count(self.audio_token) for t in text) if audios is not None: + # NOTE - we intentionally avoid throwing for potentially misaligned + # text / audio inputs here because some inference engines will + # trigger the conditions due to the way they call multimodal + # processors, e.g., vLLM. audio_inputs = self.audio_processor(audios, device=device) audio_embed_sizes = audio_inputs.pop("audio_embed_sizes") - if any(text.count(self.audio_token) != 1 for text in text): - raise ValueError("Only one audio sample is currently supported per input") - if len(audio_embed_sizes) != expected_num_audios: - raise ValueError("Text/Audio mismatch. 
The number of audios and audio tokens do not match") # duplicate the audio placeholders to match the feature dims processed_text = self._expand_audio_placeholders(text, audio_embed_sizes) else: audio_inputs = {} - assert expected_num_audios == 0, "No audio is provided, expecting no audio tokens" text_inputs = self.tokenizer(processed_text, padding=True, **kwargs) return BatchFeature(data={**text_inputs, **audio_inputs}) @@ -93,7 +91,6 @@ def _expand_audio_placeholders(self, text: list[str], num_audio_features: List[i prompt_strings = [sample.replace("", self.audio_token) for sample in prompt_strings] return prompt_strings - ##### Validation def _get_validated_text(self, text: Union[str, list]) -> List[str]: if isinstance(text, str): return [text] From d18f459f40e9e34cfb5b0161ad32fb1133c8b660 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 3 Apr 2025 12:34:35 +0000 Subject: [PATCH 084/116] Formatting fixes --- .../feature_extraction_granite_speech.py | 6 ++++-- .../granite_speech/modeling_granite_speech.py | 18 ++++++++---------- .../processing_granite_speech.py | 2 -- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py index 77204f54f6b9..ba3a5c1d4bd7 100644 --- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -16,15 +16,17 @@ Feature extractor class for Granite Speech. """ -from collections.abc import Sequence import math +from collections.abc import Sequence +from typing import List, Optional + import numpy as np -from typing import List, Optional, Union, Tuple from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin from transformers.tokenization_utils_base import AudioInput from transformers.utils import is_torch_available, is_torchaudio_available, logging + logger = logging.get_logger(__name__) if is_torch_available(): diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index bf53a8dc0fc5..7e8e16fc6d29 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -5,7 +5,7 @@ import torch import torch.nn.functional as F import torch.utils.checkpoint -from torch import einsum, nn +from torch import nn from transformers.activations import ACT2FN from transformers.generation import GenerationMixin @@ -777,9 +777,7 @@ class GraniteSpeechCTCModel(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): super(GraniteSpeechCTCModel, self).__init__() self.input_linear = nn.Linear(config.input_dim, config.hidden_dim, bias=True) - self.layers = nn.ModuleList( - [GraniteSpeechConformerBlock(config) for _ in range(config.num_layers)] - ) + self.layers = nn.ModuleList([GraniteSpeechConformerBlock(config) for _ in range(config.num_layers)]) self.out = nn.Linear(config.hidden_dim, config.output_dim, bias=True) self.out_mid = nn.Linear(config.output_dim, config.hidden_dim, bias=True) @@ -812,7 +810,6 @@ def forward(self, x): return self.conv(x) - class GraniteSpeechConformerAttention(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() @@ -889,14 +886,14 @@ class GraniteSpeechConformerConvModule(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): 
super().__init__() inner_dim = config.hidden_dim * config.conv_expansion_factor - padding = self.calc_same_padding(config.conv_kernel_size) + padding = self.calc_same_padding(config.conv_kernel_size) self.norm = nn.LayerNorm(config.hidden_dim) self.up_conv = nn.Conv1d(config.hidden_dim, inner_dim * 2, 1) self.glu = nn.GLU(dim=1) self.depth_conv = GraniteSpeechConformerDepthWiseConv1d( - inner_dim, inner_dim, kernel_size=config.conv_kernel_size, padding=padding - ) + inner_dim, inner_dim, kernel_size=config.conv_kernel_size, padding=padding + ) self.silu = nn.SiLU() self.batch_norm = nn.BatchNorm1d(inner_dim) self.down_conv = nn.Conv1d(inner_dim, config.hidden_dim, 1) @@ -1259,8 +1256,9 @@ def get_merged_audio_embeddings(self, input_ids, audio_features, input_features_ special_audio_mask = is_audio_index.unsqueeze(-1) audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) if input_features_mask is not None: - assert torch.all(is_audio_index.int().sum(dim=1) == input_features_mask.int().sum(dim=1)).item(), \ - "number of features should align" + assert torch.all( + is_audio_index.int().sum(dim=1) == input_features_mask.int().sum(dim=1) + ).item(), "number of features should align" audio_features = audio_features[input_features_mask] inputs_embeds = inputs_embeds.masked_scatter( diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index e5b020f6525f..b65be85df376 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -52,7 +52,6 @@ def __call__( device: str = "cpu", **kwargs, ) -> BatchFeature: - text = self._get_validated_text(text) if audios is not None: @@ -99,5 +98,4 @@ def _get_validated_text(self, text: Union[str, list]) -> List[str]: raise TypeError("Invalid text provided! Text should be a string or list of strings.") - __all__ = ["GraniteSpeechProcessor"] From 5adc0a9783ad52d3529aa3582188b97c08a9e2a6 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 3 Apr 2025 12:39:29 +0000 Subject: [PATCH 085/116] Update init structure to mirror llama --- .../models/granite_speech/__init__.py | 73 ++----------------- 1 file changed, 7 insertions(+), 66 deletions(-) diff --git a/src/transformers/models/granite_speech/__init__.py b/src/transformers/models/granite_speech/__init__.py index e44510c3e43f..d61225818552 100644 --- a/src/transformers/models/granite_speech/__init__.py +++ b/src/transformers/models/granite_speech/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2025 EleutherAI and The HuggingFace Inc. team. All rights reserved. +# Copyright 2025 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,76 +13,17 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, - is_torchaudio_available, -) +from ...utils import _LazyModule from ...utils.import_utils import define_import_structure -_import_structure = { - "configuration_granite_speech": [ - "GraniteSpeechConfig", - "GraniteSpeechEncoderConfig", - "GraniteSpeechProjectorConfig", - ], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_granite_speech"] = [ - "GraniteSpeechForConditionalGeneration", - "GraniteSpeechPreTrainedModel", - "GraniteSpeechEncoderProjectorPreTrainedModel", - "GraniteSpeechQFormerModel", - ] - -try: - if not is_torchaudio_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["feature_extraction_granite_speech"] = ["GraniteSpeechFeatureExtractor"] - _import_structure["processing_granite_speech"] = ["GraniteSpeechProcessor"] - - if TYPE_CHECKING: - from .configuration_granite_speech import ( - GraniteSpeechConfig, - GraniteSpeechEncoderConfig, - GraniteSpeechProjectorConfig, - ) - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_granite_speech import ( - GraniteSpeechEncoderProjectorPreTrainedModel, - GraniteSpeechForConditionalGeneration, - GraniteSpeechPreTrainedModel, - GraniteSpeechQFormerModel, - ) - - try: - if not is_torchaudio_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .feature_extraction_granite_speech import GraniteSpeechFeatureExtractor - from .processing_granite_speech import GraniteSpeechProcessor + from .configuration_granite_speech import * + from .feature_extraction_granite_speech import * + from .modeling_granite_speech import * + from .processing_granite_speech import * else: import sys _file = globals()["__file__"] - sys.modules[__name__] = _LazyModule(__name__, _file, _import_structure, module_spec=__spec__) + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) From e7f7af6de69f2c21b16b09c15905088ae6b93267 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 3 Apr 2025 16:04:41 +0000 Subject: [PATCH 086/116] Make granite speech projector generic --- docs/source/en/model_doc/granite_speech.md | 11 - src/transformers/__init__.py | 45 -- .../configuration_granite_speech.py | 63 +- .../granite_speech/modeling_granite_speech.py | 698 +----------------- utils/check_repo.py | 1 - 5 files changed, 25 insertions(+), 793 deletions(-) diff --git a/docs/source/en/model_doc/granite_speech.md b/docs/source/en/model_doc/granite_speech.md index ae5f85f18276..3480b379fbfb 100644 --- a/docs/source/en/model_doc/granite_speech.md +++ b/docs/source/en/model_doc/granite_speech.md @@ -33,11 +33,6 @@ Currently being updated! [[autodoc]] GraniteSpeechEncoderConfig -## GraniteSpeechProjectorConfig - -[[autodoc]] GraniteSpeechProjectorConfig - - ## GraniteSpeechProcessor [[autodoc]] GraniteSpeechProcessor @@ -48,12 +43,6 @@ Currently being updated! 
[[autodoc]] GraniteSpeechFeatureExtractor -## GraniteSpeechQFormerModel - -[[autodoc]] GraniteSpeechQFormerModel - - forward - - ## GraniteSpeechForConditionalGeneration [[autodoc]] GraniteSpeechForConditionalGeneration diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d4709337c9ec..2d054920196c 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -530,51 +530,6 @@ _import_structure["tf_utils"] = [] -<<<<<<< HEAD -======= -try: - if not ( - is_librosa_available() - and is_essentia_available() - and is_scipy_available() - and is_torch_available() - and is_pretty_midi_available() - ): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils import ( - dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects, - ) - - _import_structure["utils.dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects"] = [ - name - for name in dir(dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects) - if not name.startswith("_") - ] -else: - _import_structure["models.pop2piano"].append("Pop2PianoFeatureExtractor") - _import_structure["models.pop2piano"].append("Pop2PianoTokenizer") - _import_structure["models.pop2piano"].append("Pop2PianoProcessor") - -try: - if not is_torchaudio_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils import ( - dummy_torchaudio_objects, - ) - - _import_structure["utils.dummy_torchaudio_objects"] = [ - name for name in dir(dummy_torchaudio_objects) if not name.startswith("_") - ] -else: - _import_structure["models.granite_speech"].append("GraniteSpeechFeatureExtractor") - _import_structure["models.granite_speech"].append("GraniteSpeechProcessor") - - _import_structure["models.musicgen_melody"].append("MusicgenMelodyFeatureExtractor") - _import_structure["models.musicgen_melody"].append("MusicgenMelodyProcessor") - ->>>>>>> 1a8b4a742 (Import sorting) # FLAX-backed objects try: if not is_flax_available(): diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 9aa1d2aad65d..f2037d5c52e6 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -34,58 +34,12 @@ def __init__( self.conv_expansion_factor = conv_expansion_factor -## adapted from transformers.models.blip.configuration_blip_2.Blip2VisionConfig -class GraniteSpeechProjectorConfig(PretrainedConfig): - model_type = "granite_speech_qformer" - - def __init__( - self, - llm_dim=4096, - downsample_rate=5, - window_size=15, - hidden_size=1024, - num_attention_heads=16, - intermediate_size=4096, - num_hidden_layers=2, - encoder_hidden_size=1024, - cross_attention_frequency=1, - max_position_embeddings=2048, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - position_embedding_type="absolute", - use_qformer_text_input=False, - **kwargs, - ): - super().__init__(pad_token_id=pad_token_id, **kwargs) - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - 
self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type - self.cross_attention_frequency = cross_attention_frequency - self.encoder_hidden_size = encoder_hidden_size - self.use_qformer_text_input = use_qformer_text_input - self.downsample_rate = downsample_rate - self.window_size = window_size - self.llm_dim = llm_dim - - class GraniteSpeechConfig(PretrainedConfig): model_type = "granite_speech" sub_configs = { "text_config": AutoConfig, "encoder_config": GraniteSpeechEncoderConfig, - "projector_config": GraniteSpeechProjectorConfig, + "projector_config": AutoConfig, } def __init__( @@ -96,6 +50,9 @@ def __init__( audio_token_index=49155, initializer_range=0.02, has_lora_adapter=True, + # Extra projector stuff + downsample_rate=5, + window_size=15, **kwargs, ): if isinstance(text_config, dict): @@ -105,10 +62,12 @@ def __init__( text_config = CONFIG_MAPPING["granite"]() if isinstance(projector_config, dict): - # TODO - In the future, we should make this generic. - projector_config = GraniteSpeechProjectorConfig(**projector_config) + projector_config["model_type"] = ( + projector_config["model_type"] if "model_type" in projector_config else "blip_2_qformer" + ) + projector_config = CONFIG_MAPPING[projector_config["model_type"]](**projector_config) elif projector_config is None: - projector_config = GraniteSpeechProjectorConfig() + projector_config = CONFIG_MAPPING["blip_2_qformer"]() if not isinstance(encoder_config, GraniteSpeechEncoderConfig): encoder_config = {} if encoder_config is None else encoder_config @@ -120,7 +79,9 @@ def __init__( self.audio_token_index = audio_token_index self.initializer_range = initializer_range self.has_lora_adapter = has_lora_adapter + self.downsample_rate = downsample_rate + self.window_size = window_size super().__init__(**kwargs) -__all__ = ["GraniteSpeechEncoderConfig", "GraniteSpeechProjectorConfig", "GraniteSpeechConfig"] +__all__ = ["GraniteSpeechEncoderConfig", "GraniteSpeechConfig"] diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 7e8e16fc6d29..f231fa8d7a60 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -7,16 +7,12 @@ import torch.utils.checkpoint from torch import nn -from transformers.activations import ACT2FN from transformers.generation import GenerationMixin from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, ModelOutput, ) from transformers.modeling_utils import PreTrainedModel -from transformers.models.auto import AutoModelForCausalLM -from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from transformers.models.auto import AutoModel, AutoModelForCausalLM from transformers.utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, @@ -28,7 +24,6 @@ from .configuration_granite_speech import ( GraniteSpeechConfig, GraniteSpeechEncoderConfig, - GraniteSpeechProjectorConfig, ) @@ -73,685 +68,20 @@ class GraniteSpeechCausalLMOutputWithPast(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None -### Projector -# Currently, we copy the Qformer code directly to avoid depending on Blip2; -# it would be better to create the 
model from config, similar to the LLM, -# but to do this, we will need to register the QFormer model into an automodel, -# which will should involve pulling it out into its own dir so that it is accessible -# under transformers.models.X. - - -# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerMultiHeadAttention with Blip2->GraniteSpeech -class GraniteSpeechQFormerMultiHeadAttention(nn.Module): - def __init__(self, config, is_cross_attention=False): - super().__init__() - self.config = config - if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention heads (%d)" - % (config.hidden_size, config.num_attention_heads) - ) - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - if is_cross_attention: - self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) - self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) - else: - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) - self.save_attention = False - - def save_attn_gradients(self, attn_gradients): - self.attn_gradients = attn_gradients - - def get_attn_gradients(self): - return self.attn_gradients - - def save_attention_map(self, attention_map): - self.attention_map = attention_map - - def get_attention_map(self): - return self.attention_map - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. 
- is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention: - key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - mixed_query_layer = self.query(hidden_states) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - past_key_value = (key_layer, value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) - - if is_cross_attention and self.save_attention: - self.save_attention_map(attention_probs) - attention_probs.register_hook(self.save_attn_gradients) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs_dropped = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs_dropped = attention_probs_dropped * head_mask - - context_layer = torch.matmul(attention_probs_dropped, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - - outputs = outputs + (past_key_value,) - return outputs - - -# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->GraniteSpeechQFormer -class GraniteSpeechQFormerSelfOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerAttention with Blip2->GraniteSpeech -class GraniteSpeechQFormerAttention(nn.Module): - def __init__(self, config, is_cross_attention=False): - super().__init__() - self.attention = GraniteSpeechQFormerMultiHeadAttention(config, is_cross_attention) - self.output = GraniteSpeechQFormerSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads - ) - - # Prune linear layers - self.attention.query = prune_linear_layer(self.attention.query, index) - self.attention.key = prune_linear_layer(self.attention.key, index) - self.attention.value = prune_linear_layer(self.attention.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) - self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - self_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs - - -# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->GraniteSpeechQFormer -class GraniteSpeechQFormerIntermediate(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, 
config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->GraniteSpeechQFormer -class GraniteSpeechQFormerOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerLayer with Blip2->GraniteSpeech -class GraniteSpeechQFormerLayer(nn.Module): - def __init__(self, config, layer_idx): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = GraniteSpeechQFormerAttention(config) - - self.layer_idx = layer_idx - - if layer_idx % config.cross_attention_frequency == 0: - self.crossattention = GraniteSpeechQFormerAttention(config, is_cross_attention=True) - self.has_cross_attention = True - else: - self.has_cross_attention = False - - if config.use_qformer_text_input: - self.intermediate = GraniteSpeechQFormerIntermediate(config) - self.output = GraniteSpeechQFormerOutput(config) - - self.intermediate_query = GraniteSpeechQFormerIntermediate(config) - self.output_query = GraniteSpeechQFormerOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - query_length=0, - ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] - outputs = self_attention_outputs[1:-1] - - present_key_value = self_attention_outputs[-1] - - if query_length > 0: - query_attention_output = attention_output[:, :query_length, :] - - if self.has_cross_attention: - if encoder_hidden_states is None: - raise ValueError("encoder_hidden_states must be given for cross-attention layers") - cross_attention_outputs = self.crossattention( - query_attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - output_attentions=output_attentions, - ) - query_attention_output = cross_attention_outputs[0] - # add cross attentions if we output attention weights - outputs = outputs + cross_attention_outputs[1:-1] - - layer_output = apply_chunking_to_forward( - self.feed_forward_chunk_query, - self.chunk_size_feed_forward, - self.seq_len_dim, - query_attention_output, - ) - - if attention_output.shape[1] > query_length: - layer_output_text = apply_chunking_to_forward( - self.feed_forward_chunk, - 
self.chunk_size_feed_forward, - self.seq_len_dim, - attention_output[:, query_length:, :], - ) - layer_output = torch.cat([layer_output, layer_output_text], dim=1) - else: - layer_output = apply_chunking_to_forward( - self.feed_forward_chunk, - self.chunk_size_feed_forward, - self.seq_len_dim, - attention_output, - ) - outputs = (layer_output,) + outputs - - outputs = outputs + (present_key_value,) - - return outputs - - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - def feed_forward_chunk_query(self, attention_output): - intermediate_output = self.intermediate_query(attention_output) - layer_output = self.output_query(intermediate_output, attention_output) - return layer_output - - -# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerEncoder with Blip2->GraniteSpeech -class GraniteSpeechQFormerEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList( - [GraniteSpeechQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - query_length=0, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions else None - - next_decoder_cache = () if use_cache else None - - for i in range(self.config.num_hidden_layers): - layer_module = self.layer[i] - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[i] if past_key_values is not None else None - - if getattr(self.config, "gradient_checkpointing", False) and self.training: - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - layer_outputs = self._gradient_checkpointing_func( - layer_module.__call__, - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - query_length, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1],) - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - if layer_module.has_cross_attention: - all_cross_attentions = all_cross_attentions + (layer_outputs[2],) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -class GraniteSpeechEncoderProjectorPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = GraniteSpeechProjectorConfig - base_model_prefix = "qformer" - supports_gradient_checkpointing = True - - _no_split_modules = [ - "GraniteSpeechQFormerMultiHeadAttention", - "T5Block", - "OPTDecoderLayer", - ] - _skip_keys_device_placement = "past_key_values" - _keep_in_fp32_modules = ["query_tokens"] - - def _init_weights(self, module): - """Initialize the weights""" - factor = self.config.initializer_range - if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=factor) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - elif isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - -class GraniteSpeechQFormerModel(GraniteSpeechEncoderProjectorPreTrainedModel): - """ - Querying Transformer (Q-Former), used in GraniteSpeech. - """ - - def __init__(self, config: GraniteSpeechProjectorConfig): - super().__init__(config) - self.config = config - - self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - self.encoder = GraniteSpeechQFormerEncoder(config) - - self.post_init() - - # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerModel.get_input_embeddings - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerModel.set_input_embeddings - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerModel.get_extended_attention_mask - def get_extended_attention_mask( - self, - attention_mask: torch.Tensor, - input_shape: Tuple[int], - device: torch.device, - has_query: bool = False, - ) -> torch.Tensor: - """ - Makes broadcastable attention and causal masks so that future and masked tokens are ignored. 
- - Arguments: - attention_mask (`torch.Tensor`): - Mask with ones indicating tokens to attend to, zeros for tokens to ignore. - input_shape (`Tuple[int]`): - The shape of the input to the model. - device (`torch.device`): - The device of the input to the model. - - Returns: - `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. - """ - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - if attention_mask.dim() == 3: - extended_attention_mask = attention_mask[:, None, :, :] - elif attention_mask.dim() == 2: - # Provided a padding mask of dimensions [batch_size, seq_length] - # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] - extended_attention_mask = attention_mask[:, None, None, :] - else: - raise ValueError( - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( - input_shape, attention_mask.shape - ) - ) - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 - return extended_attention_mask - - # Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerModel.forward - def forward( - self, - query_embeds: torch.FloatTensor, - query_length: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: - r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: - shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and - value hidden states of the attention blocks. Can be used to speed up decoding. 
If `past_key_values` are - used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key - value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape - `(batch_size, sequence_length)`. - use_cache (`bool`, `optional`): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # past_key_values_length - past_key_values_length = ( - past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 - ) - - query_length = ( - query_length if query_length is not None else query_embeds.shape[1] if query_embeds is not None else 0 - ) - - embedding_output = self.layernorm(query_embeds) - embedding_output = self.dropout(embedding_output) - - input_shape = embedding_output.size()[:-1] - batch_size, seq_length = input_shape - device = embedding_output.device - - if attention_mask is None: - attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if encoder_hidden_states is not None: - if isinstance(encoder_hidden_states, list): - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() - else: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - - if isinstance(encoder_attention_mask, list): - encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] - elif encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - query_length=query_length, - ) - sequence_output = encoder_outputs[0] - pooled_output 
= sequence_output[:, 0, :] - - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPoolingAndCrossAttentions( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - ) - - -# TODO (alex) - refactor GraniteSpeechQformer to be available under -# transformers.models.X, delete all of the code above, and -# create the model through AutoModel. - - class GraniteSpeechEncoderProjectorQFormer(nn.Module): - def __init__(self, config: GraniteSpeechProjectorConfig): + def __init__(self, config: GraniteSpeechConfig): super().__init__() - self.hidden_size = config.hidden_size - self.ds_rate = config.downsample_rate + self.hidden_size = config.projector_config.hidden_size + self.downsample_rate = config.downsample_rate self.window_size = config.window_size - self.num_queries = self.window_size // self.ds_rate - self.query = nn.Parameter(torch.zeros(1, self.num_queries, config.hidden_size)) + self.num_queries = config.window_size // config.downsample_rate + + self.query = nn.Parameter(torch.zeros(1, self.num_queries, config.projector_config.hidden_size)) self.query.data.normal_(mean=0.0, std=1.0) - # NOTE: It would be better to create this from config, similar to the LLM. - # To do this, we need to register the QFormer model into an automodel, which - # will require pulling it out into its own dir so that it's accessible under - # transformers.models.X - self.qformer = GraniteSpeechQFormerModel(config) - self.linear = nn.Linear(config.hidden_size, config.llm_dim) + # Generally, this will be create the model for blip_2_qformer, + # but we write it flexibly to allowed other projectors here as needed. + self.qformer = AutoModel.from_config(config.projector_config) + self.linear = nn.Linear(config.projector_config.hidden_size, config.text_config.hidden_size) def forward(self, x, atts): batch_size, seq_len, dim = x.size() @@ -767,7 +97,7 @@ def forward(self, x, atts): return_dict=True, ) query_proj = self.linear( - query_output.last_hidden_state.view(batch_size, nblocks * self.window_size // self.ds_rate, -1) + query_output.last_hidden_state.view(batch_size, nblocks * self.window_size // self.downsample_rate, -1) ) return query_proj @@ -1062,7 +392,7 @@ def __init__(self, config: GraniteSpeechConfig): self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] self.encoder = GraniteSpeechCTCModel(config.encoder_config) - self.projector = GraniteSpeechEncoderProjectorQFormer(config.projector_config) + self.projector = GraniteSpeechEncoderProjectorQFormer(config) if config.has_lora_adapter and not is_peft_available(): logger.warning( @@ -1297,6 +627,4 @@ def save_pretrained(self, *args, **kwargs): __all__ = [ "GraniteSpeechForConditionalGeneration", "GraniteSpeechPreTrainedModel", - "GraniteSpeechEncoderProjectorPreTrainedModel", - "GraniteSpeechQFormerModel", ] diff --git a/utils/check_repo.py b/utils/check_repo.py index 55a63112c482..0338b84ee695 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -110,7 +110,6 @@ "FastSpeech2ConformerHifiGan", # Already tested by SpeechT5HifiGan (# Copied from) "FastSpeech2ConformerWithHifiGan", # Built with two smaller (tested) models. "GraphormerDecoderHead", # Building part of bigger (tested) model. 
- "GraniteSpeechQFormerModel", # Building part of bigger (tested) model. "JukeboxVQVAE", # Building part of bigger (tested) model. "JukeboxPrior", # Building part of bigger (tested) model. "DecisionTransformerGPT2Model", # Building part of bigger (tested) model. From 3725b04e1c14b35e79a6838083afba04ee3d52c9 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 3 Apr 2025 16:12:42 +0000 Subject: [PATCH 087/116] Update test config to reflect generic projector --- .../granite_speech/test_modeling_granite_speech.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 88fd1ced55a9..ec19382379f9 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -97,7 +97,6 @@ def __init__( projector_config={ "attention_probs_dropout_prob": 0.1, "cross_attention_frequency": 1, - "downsample_rate": 5, "encoder_hidden_size": 32, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, @@ -105,24 +104,23 @@ def __init__( "initializer_range": 0.02, "intermediate_size": 256, "layer_norm_eps": 1e-12, - "llm_dim": 32, "max_position_embeddings": 2048, - "model_type": "granite_speech_qformer", + "model_type": "blip_2_qformer", "num_attention_heads": 4, "num_hidden_layers": 2, "position_embedding_type": "absolute", "use_qformer_text_input": False, "vocab_size": 30522, - "window_size": 15, }, audio_token_index=0, tie_word_embeddings=True, initializer_range=0.02, has_lora_adapter=True, + downsample_rate=5, + window_size=15, is_training=True, ): self.parent = parent - self.projector_config = None self.encoder_config = encoder_config self.text_config = text_config self.projector_config = projector_config @@ -130,6 +128,8 @@ def __init__( self.tie_word_embeddings = tie_word_embeddings self.initializer_range = initializer_range self.has_lora_adapater = has_lora_adapter + self.downsample_rate = downsample_rate + self.window_size = window_size self.is_training = is_training # Dims for audio features From a5216fbb1b1ab1ddc548c08a5dd60bf388ad8e7f Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 4 Apr 2025 05:54:29 +0000 Subject: [PATCH 088/116] Formatting fixes --- .../models/granite_speech/modeling_granite_speech.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index f231fa8d7a60..5dff2b181238 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -586,9 +586,9 @@ def get_merged_audio_embeddings(self, input_ids, audio_features, input_features_ special_audio_mask = is_audio_index.unsqueeze(-1) audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) if input_features_mask is not None: - assert torch.all( - is_audio_index.int().sum(dim=1) == input_features_mask.int().sum(dim=1) - ).item(), "number of features should align" + assert torch.all(is_audio_index.int().sum(dim=1) == input_features_mask.int().sum(dim=1)).item(), ( + "number of features should align" + ) audio_features = audio_features[input_features_mask] inputs_embeds = inputs_embeds.masked_scatter( From ff5869f1dd53aa55ba7d2b8b5c9cd6563fd43c7f Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 4 Apr 2025 06:06:55 +0000 Subject: [PATCH 089/116] Fix typos, add 
license --- .../configuration_granite_speech.py | 16 ++++++++++++++++ .../feature_extraction_granite_speech.py | 4 +--- .../granite_speech/modeling_granite_speech.py | 19 +++++++++++++++++-- .../processing_granite_speech.py | 4 +--- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index f2037d5c52e6..f2d7a8287ce0 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -1,3 +1,19 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Config class for Granite Speech.""" + from transformers.configuration_utils import PretrainedConfig from transformers.models.auto import CONFIG_MAPPING, AutoConfig diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py index ba3a5c1d4bd7..fbdf6e568002 100644 --- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -12,9 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" -Feature extractor class for Granite Speech. -""" +"""Feature extractor class for Granite Speech.""" import math from collections.abc import Sequence diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 5dff2b181238..04b789d0f0b0 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -1,3 +1,18 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import math from dataclasses import dataclass from typing import List, Optional, Tuple, Union @@ -128,7 +143,7 @@ def forward(self, x: torch.Tensor): return x -# NOTE: Conformer adapated from: https://github.com/lucidrains/conformer.git +# NOTE: Conformer adapted from: https://github.com/lucidrains/conformer.git class GraniteSpeechConformerDepthWiseConv1d(nn.Module): def __init__(self, chan_in, chan_out, kernel_size, padding): super().__init__() @@ -598,7 +613,7 @@ def get_merged_audio_embeddings(self, input_ids, audio_features, input_features_ return inputs_embeds def generate(self, *args, **kwargs): - """This model is expected to have a lora adapater, which is only + """This model is expected to have a lora adapter, which is only enabled when considering audio inputs. As such, we override generate to conditionally enable / disable the lora adapter based on whether or not any input features were provided. diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index b65be85df376..7b361465bcb7 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -12,9 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" -Processor class for Granite Speech. -""" +"""Processor class for Granite Speech.""" from typing import List, Union From edfdfbe380525a7f080c02455c9d1a82467ed818 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 4 Apr 2025 06:26:56 +0000 Subject: [PATCH 090/116] Fix undefined var in input processing --- .../processing_granite_speech.py | 42 ++++++++----------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 7b361465bcb7..3a68cc39137c 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -51,6 +51,7 @@ def __call__( **kwargs, ) -> BatchFeature: text = self._get_validated_text(text) + prompt_strings = text if audios is not None: # NOTE - we intentionally avoid throwing for potentially misaligned @@ -59,35 +60,28 @@ def __call__( # processors, e.g., vLLM. 
audio_inputs = self.audio_processor(audios, device=device) audio_embed_sizes = audio_inputs.pop("audio_embed_sizes") - # duplicate the audio placeholders to match the feature dims - processed_text = self._expand_audio_placeholders(text, audio_embed_sizes) + + # Expand the audio placeholders to match the feature dims; this + # is similar to how many VLMs handle image tokens, e.g., llava next + prompt_strings = [] + num_replaced = 0 + for sample in text: + while self.audio_token in sample: + sample = sample.replace( + self.audio_token, + "" * audio_embed_sizes[num_replaced], + 1, + ) + num_replaced += 1 + prompt_strings.append(sample) + + prompt_strings = [sample.replace("", self.audio_token) for sample in prompt_strings] else: audio_inputs = {} - text_inputs = self.tokenizer(processed_text, padding=True, **kwargs) + text_inputs = self.tokenizer(prompt_strings, padding=True, **kwargs) return BatchFeature(data={**text_inputs, **audio_inputs}) - def _expand_audio_placeholders(self, text: list[str], num_audio_features: List[int]): - """ - Expands audio placeholders in the formatted text to match the number of - features of the corresponding embeddings; we can use the resulting text - to conveniently mask the audio features into the text embeddings. - """ - prompt_strings = [] - num_replaced = 0 - for sample in text: - while self.audio_token in sample: - sample = sample.replace( - self.audio_token, - "" * num_audio_features[num_replaced], - 1, - ) - num_replaced += 1 - prompt_strings.append(sample) - - prompt_strings = [sample.replace("", self.audio_token) for sample in prompt_strings] - return prompt_strings - def _get_validated_text(self, text: Union[str, list]) -> List[str]: if isinstance(text, str): return [text] From de0bf8b883e8209ad663239b86ac9721a3dc208a Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 4 Apr 2025 07:58:46 +0000 Subject: [PATCH 091/116] Cleanup and expose ctc encoder --- .../granite_speech/modeling_granite_speech.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 04b789d0f0b0..da655cef7c0b 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -118,29 +118,27 @@ def forward(self, x, atts): ### Encoder -class GraniteSpeechCTCModel(nn.Module): +class GraniteSpeechCTCEncoder(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): - super(GraniteSpeechCTCModel, self).__init__() + super().__init__() + self.config = config self.input_linear = nn.Linear(config.input_dim, config.hidden_dim, bias=True) self.layers = nn.ModuleList([GraniteSpeechConformerBlock(config) for _ in range(config.num_layers)]) self.out = nn.Linear(config.hidden_dim, config.output_dim, bias=True) self.out_mid = nn.Linear(config.output_dim, config.hidden_dim, bias=True) self.context_size = config.context_size - self.input_dim = config.input_dim self.num_layers = config.num_layers - self.hidden_dim = config.hidden_dim - self.output_dim = config.output_dim - def forward(self, x: torch.Tensor): - x = self.input_linear(x) + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.input_linear(hidden_states) for idx, layer in enumerate(self.layers, start=1): - x = layer(x, self.context_size) + hidden_states = layer(hidden_states, self.context_size) if idx == self.num_layers // 2: - x_mid = x.clone() - x_mid = 
self.out(x_mid) -            x += self.out_mid(nn.Softmax(dim=-1)(x_mid)) -            return x +            hidden_states_mid = hidden_states.clone() +            hidden_states_mid = self.out(hidden_states_mid) +            hidden_states += self.out_mid(nn.Softmax(dim=-1)(hidden_states_mid)) +        return hidden_states   # NOTE: Conformer adapted from: https://github.com/lucidrains/conformer.git @@ -406,7 +404,7 @@ def __init__(self, config: GraniteSpeechConfig): if self.language_model._tied_weights_keys is not None: self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys]  -        self.encoder = GraniteSpeechCTCModel(config.encoder_config) +        self.encoder = GraniteSpeechCTCEncoder(config.encoder_config) self.projector = GraniteSpeechEncoderProjectorQFormer(config)  if config.has_lora_adapter and not is_peft_available(): @@ -612,7 +610,7 @@ def get_merged_audio_embeddings(self, input_ids, audio_features, input_features_ ) return inputs_embeds  -    def generate(self, *args, **kwargs): +    def generate(self, *args, **kwargs) -> torch.LongTensor: """This model is expected to have a lora adapter, which is only enabled when considering audio inputs. As such, we override generate to conditionally enable / disable the lora adapter based on whether @@ -640,6 +638,7 @@ def save_pretrained(self, *args, **kwargs):  __all__ = [ +    "GraniteSpeechCTCEncoder", "GraniteSpeechForConditionalGeneration", "GraniteSpeechPreTrainedModel", ] From 1db4fd8b4361c46d91ce2099bb5ed251f6d06549 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 4 Apr 2025 08:00:41 +0000 Subject: [PATCH 092/116] Add missing config docstrings --- .../configuration_granite_speech.py | 94 ++++++++++++++++++- 1 file changed, 92 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index f2d7a8287ce0..5f4b2b43b2bb 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -19,6 +19,56 @@  class GraniteSpeechEncoderConfig(PretrainedConfig): +    r""" +    This is the configuration class to store the configuration of a [`GraniteSpeechCTCEncoder`]. It is used to instantiate +    a Granite Speech audio encoder according to the specified arguments, defining the model architecture. Instantiating a +    configuration with the defaults will yield a similar configuration to that of the audio encoder of the Granite Speech +    architecture. + +    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the +    documentation from [`PretrainedConfig`] for more information. + +    Args: +        input_dim (`int`, *optional*, defaults to 160): +            Dimension of the first hidden layer of the encoder. +        num_layers (`int`, *optional*, defaults to 10): +            Number of encoder blocks. +        hidden_dim (`int`, *optional*, defaults to 1024): +            The size of the intermediate layers in the conformer encoder. +        feedforward_mult (`int`, *optional*, defaults to 4): +            Multiplier for the up/down projections in the encoder's feedforward layers; +            The projections will have intermediate dim of size `hidden_dim * feedforward_mult`. +        num_heads (`int`, *optional*, defaults to 8): +            Number of attention heads for each attention layer in the Transformer encoder. +        dim_head (`int`, *optional*, defaults to 128): +            Dimension of attention heads for each attention layer in the Transformer encoder.
+    output_dim (`int`, *optional*, defaults to 42): +        Intermediate dimension of the feedforward projections in the conformer +        to be added to every other encoder block's output. +    context_size (`int`, *optional*, defaults to 200): +        Context size to be used in conformer attention. +    dropout (`float`, *optional*, defaults to 0.1): +        The dropout probability for fully connected layers in the encoder. +    conv_kernel_size (`int`, *optional*, defaults to 15): +        Kernel size to be used for 1D convolution in each conformer block. +    conv_expansion_factor (`int`, *optional*, defaults to 2): +        Intermediate dimension to be used in conformer convolutions. + +    Example: + +    ```python +    >>> from transformers import GraniteSpeechEncoderConfig, GraniteSpeechCTCEncoder + +    >>> # Initializing a GraniteSpeechEncoderConfig +    >>> configuration = GraniteSpeechEncoderConfig() + +    >>> # Initializing a GraniteSpeechCTCEncoder (with random weights) +    >>> model = GraniteSpeechCTCEncoder(configuration) + +    >>> # Accessing the model configuration +    >>> configuration = model.config +    ```""" + model_type = "granite_speech_encoder" def __init__( @@ -51,6 +101,47 @@ def __init__(  class GraniteSpeechConfig(PretrainedConfig): +    r""" +    This is the configuration class to store the configuration of a [`GraniteSpeechForConditionalGeneration`]. It is used to instantiate a +    Granite Speech model according to the specified arguments, defining the model architecture. + +    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the +    documentation from [`PretrainedConfig`] for more information. + +    Args: +        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `GraniteConfig`): +            The config object or dictionary of the text backbone. +        encoder_config (`GraniteSpeechEncoderConfig`, *optional*): +            The config object or dictionary of the Granite Speech CTC Encoder. +        projector_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Blip2QFormerConfig`): +            The config object or dictionary of the audio projector. +        audio_token_index (`int`, *optional*, defaults to 49155): +            The audio token index to encode the audio prompt. +        initializer_range (`float`, *optional*, defaults to 0.02): +            The standard deviation of the truncated_normal_initializer for initializing all weight matrices. +        has_lora_adapter (`bool`, *optional*, defaults to `True`): +            Indicates whether or not the model has a lora adapter that should only +            be activated when processing audio inputs. +        downsample_rate (`int`, *optional*, defaults to 5): +            Downsample rate for the audio feature extractor. +        window_size (`int`, *optional*, defaults to 15): +            Window size for the audio feature projector.
+ + Example: + + ```python + >>> from transformers import GraniteSpeechConfig, GraniteSpeechForConditionalGeneration + + >>> # Initializing a GraniteSpeechConfig + >>> configuration = GraniteSpeechConfig() + + >>> # Initializing a GraniteSpeechForConditionalGeneration (with random weights) + >>> model = GraniteSpeechForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "granite_speech" sub_configs = { "text_config": AutoConfig, @@ -60,13 +151,12 @@ class GraniteSpeechConfig(PretrainedConfig): def __init__( self, - encoder_config=None, text_config=None, + encoder_config=None, projector_config=None, audio_token_index=49155, initializer_range=0.02, has_lora_adapter=True, - # Extra projector stuff downsample_rate=5, window_size=15, **kwargs, From e078bb99e28981bc6a4c38c999641d695cc7df46 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 4 Apr 2025 09:02:17 +0000 Subject: [PATCH 093/116] Better var names, type hints, etc --- .../granite_speech/modeling_granite_speech.py | 229 ++++++++++-------- 1 file changed, 128 insertions(+), 101 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index da655cef7c0b..5525100f3ed9 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -83,7 +83,8 @@ class GraniteSpeechCausalLMOutputWithPast(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None -class GraniteSpeechEncoderProjectorQFormer(nn.Module): +### Projector +class GraniteSpeechEncoderProjector(nn.Module): def __init__(self, config: GraniteSpeechConfig): super().__init__() self.hidden_size = config.projector_config.hidden_size @@ -93,22 +94,22 @@ def __init__(self, config: GraniteSpeechConfig): self.query = nn.Parameter(torch.zeros(1, self.num_queries, config.projector_config.hidden_size)) self.query.data.normal_(mean=0.0, std=1.0) - # Generally, this will be create the model for blip_2_qformer, - # but we write it flexibly to allowed other projectors here as needed. 
+ + # By default, this will be a blip_2_qformer config self.qformer = AutoModel.from_config(config.projector_config) self.linear = nn.Linear(config.projector_config.hidden_size, config.text_config.hidden_size) - def forward(self, x, atts): - batch_size, seq_len, dim = x.size() + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, seq_len, dim = hidden_states.size() nblocks = math.ceil(seq_len / self.window_size) pad = nblocks * self.window_size - seq_len - x = nn.functional.pad(x, (0, 0, 0, pad), "constant", 0) - x = x.view(batch_size * nblocks, self.window_size, dim) + hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, pad), "constant", 0) + hidden_states = hidden_states.view(batch_size * nblocks, self.window_size, dim) query_output = self.qformer( query_embeds=self.query.data, - encoder_hidden_states=x, - encoder_attention_mask=atts, + encoder_hidden_states=hidden_states, + encoder_attention_mask=None, return_dict=True, ) query_proj = self.linear( @@ -117,7 +118,7 @@ def forward(self, x, atts): return query_proj -### Encoder +### Encoder - conformer is adapted from: https://github.com/lucidrains/conformer.git class GraniteSpeechCTCEncoder(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() @@ -141,91 +142,111 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states -# NOTE: Conformer adapted from: https://github.com/lucidrains/conformer.git -class GraniteSpeechConformerDepthWiseConv1d(nn.Module): - def __init__(self, chan_in, chan_out, kernel_size, padding): +class GraniteSpeechConformerBlock(nn.Module): + """Conformer block, consisting largely of linear layers, attention, and convolutional layers.""" + + def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() - self.padding = padding - self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in, bias=False) + self.ff1 = GraniteSpeechConformerFeedForward(config) + self.attn = GraniteSpeechConformerAttention(config) + self.conv = GraniteSpeechConformerConvModule(config) + self.ff2 = GraniteSpeechConformerFeedForward(config) + self.post_norm = nn.LayerNorm(config.hidden_dim) + + def forward(self, hidden_states: torch.Tensor, context_size: int) -> torch.Tensor: + hidden_states = 0.5 * self.ff1(hidden_states) + hidden_states + hidden_states = self.attn(hidden_states, context_size) + hidden_states + hidden_states = self.conv(hidden_states) + hidden_states + hidden_states = 0.5 * self.ff2(hidden_states) + hidden_states + hidden_states = self.post_norm(hidden_states) + return hidden_states - def forward(self, x): - x = F.pad(x, self.padding) - return self.conv(x) + +class GraniteSpeechConformerFeedForward(nn.Module): + """Feedforward module for conformer encoder blocks.""" + + def __init__(self, config: GraniteSpeechEncoderConfig): + super().__init__() + self.pre_norm = nn.LayerNorm(config.hidden_dim) + self.up_proj = nn.Linear(config.hidden_dim, config.hidden_dim * config.feedforward_mult) + self.silu = nn.SiLU() + self.dropout = nn.Dropout(config.dropout) + self.down_proj = nn.Linear(config.hidden_dim * config.feedforward_mult, config.hidden_dim) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.pre_norm(hidden_states) + hidden_states = self.up_proj(hidden_states) + hidden_states = self.dropout(self.silu(hidden_states)) + hidden_states = self.down_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states class GraniteSpeechConformerAttention(nn.Module): + 
"""Attention for conformer blocks with shaw's relpos embeddings.""" + def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() + + inner_dim = config.dim_head * config.num_heads + self.max_pos_emb = 512 self.num_heads = config.num_heads - inner_dim = config.dim_head * self.num_heads self.dim_head = config.dim_head self.scale = self.dim_head**-0.5 self.pre_norm = nn.LayerNorm(config.hidden_dim) self.to_q = nn.Linear(config.hidden_dim, inner_dim, bias=False) self.to_kv = nn.Linear(config.hidden_dim, inner_dim * 2, bias=False) self.to_out = nn.Linear(inner_dim, config.hidden_dim) - - self.max_pos_emb = 512 self.rel_pos_emb = nn.Embedding(2 * self.max_pos_emb + 1, self.dim_head) - self.dropout = nn.Dropout(config.dropout) - def forward(self, x, context_size): - x = self.pre_norm(x) - device, h, max_pos_emb = x.device, self.num_heads, self.max_pos_emb - bs, n, d = x.shape - assert context_size > 0 and context_size <= max_pos_emb + def forward(self, hidden_states: torch.Tensor, context_size: int) -> torch.Tensor: + if context_size <= 0 or context_size > self.max_pos_emb: + raise ValueError("Context size is either less than 0 or exceeds the max_pos_emb") + + hidden_states = self.pre_norm(hidden_states) + bsz, num_features, _ = hidden_states.shape - nb = math.ceil(n / context_size) - nr = n % context_size - if nr > 0: + num_blocks = math.ceil(num_features / context_size) + remainder = num_features % context_size + if remainder > 0: # right padding to reach block size - x = torch.nn.functional.pad(x, (0, 0, 0, context_size - nr)) + hidden_states = torch.nn.functional.pad(hidden_states, (0, 0, 0, context_size - remainder)) - q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, dim=-1)) - q, k, v = [t.reshape(bs, nb, context_size, h, -1).transpose(2, 3) for t in (q, k, v)] + query_states = self.to_q(hidden_states) + key_states, value_states = self.to_kv(hidden_states).chunk(2, dim=-1) + query_states, key_states, value_states = [ + t.reshape(bsz, num_blocks, context_size, self.num_heads, -1).transpose(2, 3) + for t in (query_states, key_states, value_states) + ] # shaw's relative positional embedding - seq = torch.arange(context_size, device=device) + seq = torch.arange(context_size, device=hidden_states.device) dist = seq.view(-1, 1) - seq.view(1, -1) - dist = torch.clamp(dist, -context_size, context_size) + max_pos_emb - rel_pos_emb = self.rel_pos_emb(dist).to(q) + dist = torch.clamp(dist, -context_size, context_size) + self.max_pos_emb + rel_pos_emb = self.rel_pos_emb(dist).to(query_states) rel_pos_emb_expanded = rel_pos_emb.view([1, 1, 1] + list(rel_pos_emb.shape)) - pos_attn = torch.sum(q.unsqueeze(-2) * rel_pos_emb_expanded, dim=-1) * self.scale + pos_attn = torch.sum(query_states.unsqueeze(-2) * rel_pos_emb_expanded, dim=-1) * self.scale - if nr > 0: + if remainder > 0: # masked attention in the extended block - mask = torch.ones(context_size, context_size, dtype=bool, device=device) - mask[:nr, :nr] = 0 + mask = torch.ones(context_size, context_size, dtype=bool, device=hidden_states.device) + mask[:remainder, :remainder] = 0 mask_value = -torch.finfo(pos_attn.dtype).max pos_attn[:, -1, :].masked_fill_(mask, mask_value) with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH): - out = F.scaled_dot_product_attention(q, k, v, attn_mask=pos_attn, scale=self.scale) - out = out.transpose(2, 3).reshape(bs, x.shape[1], -1) - out = self.to_out(out[:, :n, :]) + out = F.scaled_dot_product_attention( + query_states, key_states, value_states, attn_mask=pos_attn, 
scale=self.scale + ) + out = out.transpose(2, 3).reshape(bsz, hidden_states.shape[1], -1) + out = self.to_out(out[:, :num_features, :]) return self.dropout(out) -class GraniteSpeechConformerFeedForward(nn.Module): - def __init__(self, config: GraniteSpeechEncoderConfig): - super().__init__() - self.pre_norm = nn.LayerNorm(config.hidden_dim) - self.up_proj = nn.Linear(config.hidden_dim, config.hidden_dim * config.feedforward_mult) - self.act_fn = nn.SiLU() - self.dropout = nn.Dropout(config.dropout) - self.down_proj = nn.Linear(config.hidden_dim * config.feedforward_mult, config.hidden_dim) - - def forward(self, x): - x = self.pre_norm(x) - x = self.up_proj(x) - x = self.dropout(self.act_fn(x)) - x = self.down_proj(x) - x = self.dropout(x) - return x - - class GraniteSpeechConformerConvModule(nn.Module): + """Conformer conv module consisting of several 1D/depthwise 1D convolutional layers.""" + def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() inner_dim = config.hidden_dim * config.conv_expansion_factor @@ -242,38 +263,34 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.down_conv = nn.Conv1d(inner_dim, config.hidden_dim, 1) self.dropout = nn.Dropout(config.dropout) - def forward(self, x): - x = self.norm(x) - x = self.up_conv(x.permute(0, 2, 1)) - x = self.glu(x) - x = self.depth_conv(x) - x = self.silu(self.batch_norm(x)) - x = self.down_conv(x).permute(0, 2, 1) - x = self.dropout(x) - return x + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.norm(hidden_states) + hidden_states = self.up_conv(hidden_states.permute(0, 2, 1)) + hidden_states = self.glu(hidden_states) + hidden_states = self.depth_conv(hidden_states) + hidden_states = self.silu(self.batch_norm(hidden_states)) + hidden_states = self.down_conv(hidden_states).permute(0, 2, 1) + hidden_states = self.dropout(hidden_states) + return hidden_states @staticmethod - def calc_same_padding(kernel_size: int): + def calc_same_padding(kernel_size: int) -> Tuple[int, int]: + """Calculates symmetric padding for the depthwise 1D convolution.""" pad = kernel_size // 2 return (pad, pad - (kernel_size + 1) % 2) -class GraniteSpeechConformerBlock(nn.Module): - def __init__(self, config: GraniteSpeechEncoderConfig): +class GraniteSpeechConformerDepthWiseConv1d(nn.Module): + """Wrapper for padded 1D pointwise convolution.""" + + def __init__(self, chan_in: int, chan_out: int, kernel_size: int, padding: Tuple[int, int]): super().__init__() - self.ff1 = GraniteSpeechConformerFeedForward(config) - self.attn = GraniteSpeechConformerAttention(config) - self.conv = GraniteSpeechConformerConvModule(config) - self.ff2 = GraniteSpeechConformerFeedForward(config) - self.post_norm = nn.LayerNorm(config.hidden_dim) + self.padding = padding + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in, bias=False) - def forward(self, hidden_states, context_size): - hidden_states = 0.5 * self.ff1(hidden_states) + hidden_states - hidden_states = self.attn(hidden_states, context_size) + hidden_states - hidden_states = self.conv(hidden_states) + hidden_states - hidden_states = 0.5 * self.ff2(hidden_states) + hidden_states - hidden_states = self.post_norm(hidden_states) - return hidden_states + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = F.pad(hidden_states, self.padding) + return self.conv(hidden_states) GRANITE_SPEECH_START_DOCSTRING = r""" @@ -303,12 +320,15 @@ class GraniteSpeechPreTrainedModel(PreTrainedModel): _supports_flash_attn_2 = 
True _supports_sdpa = True - def _init_weights(self, module): + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv1d)): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: @@ -405,7 +425,7 @@ def __init__(self, config: GraniteSpeechConfig): self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] self.encoder = GraniteSpeechCTCEncoder(config.encoder_config) - self.projector = GraniteSpeechEncoderProjectorQFormer(config) + self.projector = GraniteSpeechEncoderProjector(config) if config.has_lora_adapter and not is_peft_available(): logger.warning( @@ -429,9 +449,10 @@ def get_input_embeddings(self): def get_output_embeddings(self): return self.language_model.get_output_embeddings() - def get_audio_features(self, input_features): + def get_audio_features(self, input_features: torch.Tensor) -> torch.Tensor: + """Get the audio features to merged into the multimodal embeddings.""" encoder_embeds = self.encoder(input_features) - projected_embeds = self.projector(encoder_embeds, None) + projected_embeds = self.projector(encoder_embeds) return projected_embeds @add_start_docstrings_to_model_forward(GRANITE_SPEECH_INPUTS_DOCSTRING) @@ -468,11 +489,8 @@ def forward( This is useful when using packed tensor format (single dimension for batch and sequence length). Returns: - - Example: - - TODO - add example for usage. """ + # TODO (@alex-jw-brooks) add an example to this docstring once models are released output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -498,7 +516,6 @@ def forward( if input_features is not None: if input_features.dtype != self.dtype: - logger.warning(f"input features are casted to {self.dtype}") input_features = input_features.to(self.dtype) # Get the audio features from the encoder / projector audio_features = self.get_audio_features(input_features) @@ -585,11 +602,21 @@ def prepare_inputs_for_generation( model_inputs["input_features"] = input_features return model_inputs - def get_merged_audio_embeddings(self, input_ids, audio_features, input_features_mask): + def get_merged_audio_embeddings( + self, input_ids: torch.Tensor, audio_features: torch.Tensor, input_features_mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: """ Adds the audio token to the model's LLM vocabulary so that we can pass it through the tokenizer; it's assumed that the embeddings corresponding to the <|audio|> token will be clobbered with speech features. + + Args: + input_ids (`torch.Tensor`): + Input IDs containing one or more audio tokens. + audio_features (`torch.Tensor`): + Audio features to be masked into the language embeddings to form multimodal embeddings. + input_features_mask (`torch.Tensor`, *optional*, defaults to `None`) + Mask to be applied to audio features prior to scattering into the language embeddings. 
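+
+        Returns:
+            `torch.Tensor`: Language model input embeddings in which the audio token positions have been
+                replaced by the (optionally masked) audio features.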
""" is_audio_index = input_ids == self.config.audio_token_index llm_input_ids = torch.where(is_audio_index, 0, input_ids) @@ -600,7 +627,7 @@ def get_merged_audio_embeddings(self, input_ids, audio_features, input_features_ audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) if input_features_mask is not None: assert torch.all(is_audio_index.int().sum(dim=1) == input_features_mask.int().sum(dim=1)).item(), ( - "number of features should align" + "Number of audio tokens does not match number of audio features" ) audio_features = audio_features[input_features_mask] @@ -611,11 +638,11 @@ def get_merged_audio_embeddings(self, input_ids, audio_features, input_features_ return inputs_embeds def generate(self, *args, **kwargs) -> torch.LongTensor: - """This model is expected to have a lora adapter, which is only - enabled when considering audio inputs. As such, we override generate - to conditionally enable / disable the lora adapter based on whether - or not any input features were provided. - """ + # This model is expected to have a lora adapter, which is only + # enabled when considering audio inputs. As such, we override generate + # to conditionally enable / disable the lora adapter based on whether + # or not any input features were provided. + input_features = kwargs.pop("input_features", None) if is_peft_available and self._hf_peft_config_loaded: if input_features is not None: From ea9381feeb876a8b86d2f77f2106b5ad1e8f7de4 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 4 Apr 2025 09:05:36 +0000 Subject: [PATCH 094/116] Set attn context size in init --- .../granite_speech/modeling_granite_speech.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 5525100f3ed9..2f0a693cfc0b 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -128,13 +128,12 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.out = nn.Linear(config.hidden_dim, config.output_dim, bias=True) self.out_mid = nn.Linear(config.output_dim, config.hidden_dim, bias=True) - self.context_size = config.context_size self.num_layers = config.num_layers def forward(self, hidden_states: torch.Tensor): hidden_states = self.input_linear(hidden_states) for idx, layer in enumerate(self.layers, start=1): - hidden_states = layer(hidden_states, self.context_size) + hidden_states = layer(hidden_states) if idx == self.num_layers // 2: hidden_states_mid = hidden_states.clone() hidden_states_mid = self.out(hidden_states_mid) @@ -153,9 +152,9 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.ff2 = GraniteSpeechConformerFeedForward(config) self.post_norm = nn.LayerNorm(config.hidden_dim) - def forward(self, hidden_states: torch.Tensor, context_size: int) -> torch.Tensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = 0.5 * self.ff1(hidden_states) + hidden_states - hidden_states = self.attn(hidden_states, context_size) + hidden_states + hidden_states = self.attn(hidden_states) + hidden_states hidden_states = self.conv(hidden_states) + hidden_states hidden_states = 0.5 * self.ff2(hidden_states) + hidden_states hidden_states = self.post_norm(hidden_states) @@ -190,6 +189,7 @@ def __init__(self, config: GraniteSpeechEncoderConfig): inner_dim = config.dim_head * config.num_heads 
self.max_pos_emb = 512 + self.context_size = config.context_size self.num_heads = config.num_heads self.dim_head = config.dim_head self.scale = self.dim_head**-0.5 @@ -200,37 +200,37 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.rel_pos_emb = nn.Embedding(2 * self.max_pos_emb + 1, self.dim_head) self.dropout = nn.Dropout(config.dropout) - def forward(self, hidden_states: torch.Tensor, context_size: int) -> torch.Tensor: - if context_size <= 0 or context_size > self.max_pos_emb: + if self.context_size <= 0 or self.context_size > self.max_pos_emb: raise ValueError("Context size is either less than 0 or exceeds the max_pos_emb") + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.pre_norm(hidden_states) bsz, num_features, _ = hidden_states.shape - num_blocks = math.ceil(num_features / context_size) - remainder = num_features % context_size + num_blocks = math.ceil(num_features / self.context_size) + remainder = num_features % self.context_size if remainder > 0: # right padding to reach block size - hidden_states = torch.nn.functional.pad(hidden_states, (0, 0, 0, context_size - remainder)) + hidden_states = torch.nn.functional.pad(hidden_states, (0, 0, 0, self.context_size - remainder)) query_states = self.to_q(hidden_states) key_states, value_states = self.to_kv(hidden_states).chunk(2, dim=-1) query_states, key_states, value_states = [ - t.reshape(bsz, num_blocks, context_size, self.num_heads, -1).transpose(2, 3) + t.reshape(bsz, num_blocks, self.context_size, self.num_heads, -1).transpose(2, 3) for t in (query_states, key_states, value_states) ] # shaw's relative positional embedding - seq = torch.arange(context_size, device=hidden_states.device) + seq = torch.arange(self.context_size, device=hidden_states.device) dist = seq.view(-1, 1) - seq.view(1, -1) - dist = torch.clamp(dist, -context_size, context_size) + self.max_pos_emb + dist = torch.clamp(dist, -self.context_size, self.context_size) + self.max_pos_emb rel_pos_emb = self.rel_pos_emb(dist).to(query_states) rel_pos_emb_expanded = rel_pos_emb.view([1, 1, 1] + list(rel_pos_emb.shape)) pos_attn = torch.sum(query_states.unsqueeze(-2) * rel_pos_emb_expanded, dim=-1) * self.scale if remainder > 0: # masked attention in the extended block - mask = torch.ones(context_size, context_size, dtype=bool, device=hidden_states.device) + mask = torch.ones(self.context_size, self.context_size, dtype=bool, device=hidden_states.device) mask[:remainder, :remainder] = 0 mask_value = -torch.finfo(pos_attn.dtype).max pos_attn[:, -1, :].masked_fill_(mask, mask_value) From dc951235f8c0cab068533e2cfc6b9a16aab86380 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 4 Apr 2025 09:11:15 +0000 Subject: [PATCH 095/116] Add max pos emb to encoder config --- .../models/granite_speech/configuration_granite_speech.py | 4 ++++ .../models/granite_speech/modeling_granite_speech.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 5f4b2b43b2bb..9cca41a17a4e 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -47,6 +47,8 @@ class GraniteSpeechEncoderConfig(PretrainedConfig): to be added to every other encoder block's output. context_size (`int`, *optional*, defaults to 200): Context size to be used in conformer attention. 
+ max_pos_emb (`int`, *optional*, defaults to 512): + Max pos embeds to be used in attention (shaw's relative positional encoding). dropout (`float`, *optional*, defaults to 0.1): The dropout probability for fully connected layers in the encoder. conv_kernel_size (`int`, *optional*, defaults to 15): @@ -81,6 +83,7 @@ def __init__( dim_head=128, output_dim=42, context_size=200, + max_pos_emb=512, dropout=0.1, conv_kernel_size=15, conv_expansion_factor=2, @@ -98,6 +101,7 @@ def __init__( self.dropout = dropout self.conv_kernel_size = conv_kernel_size self.conv_expansion_factor = conv_expansion_factor + self.max_pos_emb = max_pos_emb class GraniteSpeechConfig(PretrainedConfig): diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 2f0a693cfc0b..8f93ebdca2c8 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -188,7 +188,7 @@ def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() inner_dim = config.dim_head * config.num_heads - self.max_pos_emb = 512 + self.max_pos_emb = config.max_pos_emb self.context_size = config.context_size self.num_heads = config.num_heads self.dim_head = config.dim_head From a125ac8f4f7967923f07b1f276ea7f152c180087 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 4 Apr 2025 09:32:50 +0000 Subject: [PATCH 096/116] Cleanup feature extractor --- .../feature_extraction_granite_speech.py | 77 ++++++++++++------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py index fbdf6e568002..f5c90656fd90 100644 --- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -16,7 +16,7 @@ import math from collections.abc import Sequence -from typing import List, Optional +from typing import Optional import numpy as np @@ -39,13 +39,13 @@ class GraniteSpeechFeatureExtractor(FeatureExtractionMixin): def __init__( self, - sampling_rate=16000, - n_fft=512, - win_length=400, - hop_length=160, - n_mels=80, - projector_window_size=15, - projector_downsample_rate=5, + sampling_rate: int = 16000, + n_fft: int = 512, + win_length: int = 400, + hop_length: int = 160, + n_mels: int = 80, + projector_window_size: int = 15, + projector_downsample_rate: int = 5, **kwargs, ): super().__init__(**kwargs) @@ -56,10 +56,7 @@ def __init__( "hop_length": hop_length, "n_mels": n_mels, } - # HACK - for now, lazily initialize the mel spectrogram transform; - # the feature extractor mixin explodes otherwise because - # it tries to log the feature extractor, and the melspectrogram - # transform isn't json serializable. 
+ # Currently lazily initialized self.melspec = None self.projector_window_size = projector_window_size self.projector_downsample_rate = projector_downsample_rate @@ -70,8 +67,7 @@ def __call__( device: Optional[str] = "cpu", ) -> BatchFeature: speech_inputs = {} - batched_audio, audio_lengths = self._get_validated_audios(audios) - # Calculate Mel features & the number of placeholders we will need + batched_audio, audio_lengths = self._get_audios_and_audio_lengths(audios) speech_inputs["input_features"] = self._extract_mel_spectrograms( batched_audio, device=device, @@ -87,36 +83,54 @@ def __call__( return BatchFeature(data=speech_inputs) def _ensure_melspec_transform_is_initialized(self): + """ + Ensures the mel spectrogram transform on this instance is initialized. + + We do this for now since some logging explodes since the mel spectrogram + transform is not JSON serializable. + """ if self.melspec is None: self.melspec = torchaudio.transforms.MelSpectrogram(**self.melspec_kwargs) - def _extract_mel_spectrograms(self, x: torch.Tensor, device="cpu"): - # TODO there is probably a better way to do both of these things... + def _extract_mel_spectrograms(self, audio: torch.Tensor, device="cpu"): + """ + Compute the Mel features to be passed to the conformer encoder. + """ + # Initialize the mel spectrogram if isn't not already and + # move the melspec / audio to the computation device. self._ensure_melspec_transform_is_initialized() if device is not None: melspec = self.melspec.to(device) - x = x.to(device) + audio = audio.to(device) else: melspec = self.melspec - B, _ = x.shape + bsz = audio.shape[0] with torch.no_grad(): - mel = melspec(x.float()) + # Compute mel features + mel = melspec(audio.float()) logmel = mel.transpose(-1, -2).clip_(min=1e-10).log10_() mx = logmel.amax(dim=(-2, -1), keepdim=True) logmel = torch.maximum(logmel, mx - 8.0).div_(4).add_(1) + # remove last frame if odd if logmel.shape[1] % 2 == 1: - logmel = logmel[:, :-1] # remove last frame if odd - x = logmel.reshape(B, -1, 2 * logmel.shape[-1]) # stacking and skipping by 2 + logmel = logmel[:, :-1] - if x.device != "cpu": - return x.detach().cpu() - return x + # stacking and skipping by 2 + audio = logmel.reshape(bsz, -1, 2 * logmel.shape[-1]) - def _get_num_audio_features(self, audio_lengths: List[int]) -> List[int]: + if audio.device != "cpu": + return audio.detach().cpu() + return audio + + def _get_num_audio_features(self, audio_lengths: Sequence[int]) -> Sequence[int]: """ - Gets the (variable length) variable length number of features - (i.e., projector output) for the sequences being considered. + Gets the (variable length) number of features (i.e., projector output) for the sequences + being considered. + + Args: + audio_lengths (`Sequence[int]`): + Sequence of one or more raw audio lengths. """ hop_length = self.melspec_kwargs["hop_length"] effective_window_size = self.projector_window_size // self.projector_downsample_rate @@ -134,7 +148,14 @@ def _get_num_audio_features(self, audio_lengths: List[int]) -> List[int]: return projector_lengths - def _get_validated_audios(self, audios: AudioInput): + def _get_audios_and_audio_lengths(self, audios: AudioInput) -> Sequence[torch.Tensor, Sequence[int]]: + """ + Coerces audio inputs to torch tensors and extracts audio lengths prior to stacking. + + Args: + audios (`AudioInput`): + Audio sequence, numpy array, or torch tensor. 
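+
+        Returns:
+            A tuple containing the batched audio tensor and a sequence of the original audio lengths.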
+ """ # Coerce to PyTorch tensors if we have numpy arrays, since # currently we have a dependency on torch/torchaudio anyway if isinstance(audios, np.ndarray): From bc887972a011b5973c473394ff5bfc3b8b3bfd02 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 4 Apr 2025 10:15:34 +0000 Subject: [PATCH 097/116] Add granite speech architecture details --- docs/source/en/model_doc/granite_speech.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/granite_speech.md b/docs/source/en/model_doc/granite_speech.md index 3480b379fbfb..212c3d149935 100644 --- a/docs/source/en/model_doc/granite_speech.md +++ b/docs/source/en/model_doc/granite_speech.md @@ -21,7 +21,26 @@ rendered properly in your Markdown viewer. ## Overview -Currently being updated! +The Granite Speech model is a multimodal language model, consisting of a speech encoder, speech projector, large language model, and LoRA adapter(s). More details regarding each component for the current (Granite 3.2 Speech) model architecture may be found below. + +1. Speech Encoder: A [Conformer](https://arxiv.org/abs/2005.08100) encoder trained with Connectionist Temporal Classification (CTC) on character-level targets on ASR corpora. The encoder uses block-attention and self-conditioned CTC from the middle layer. + +2. Speech Projector: A query transformer (q-former) operating on the outputs of the last encoder block. The encoder and projector temporally downsample the audio features to be merged into the multimodal embeddings to be processed by the llm. + +3. Large Language Model: The Granite Speech model leverages Granite LLMs, which were originally proposed in [this paper](https://arxiv.org/abs/2408.13359). + +4. LoRA adapter(s): The Granite Speech model contains a modality specific LoRA, which will be enabled when audio features are provided, and disabled otherwise. + + +Note that most of the aforementioned components are implemented generically to enable compatability and potential integration with other model architectures in transformers. + + +This model was contributed by [Alexander Brooks](https://huggingface.co/abrooks9944), [Avihu Dekel](https://huggingface.co/Avihu), and [George Saon](https://huggingface.co/gsaon). + +## Usage tips +- This model bundles its own LoRA adapter, which will be automatically loaded and enabled/disabled as needed during inference calls. Be sure to install [PEFT](https://github.com/huggingface/peft) to ensure the LoRA is correctly applied! 
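+- A minimal inference sketch is shown below. The checkpoint path is a placeholder (no released model ID is assumed here), and the plain prompt is kept deliberately simple; in practice you may want to wrap it in the model's chat template.
+
+```python
+import torchaudio
+
+from transformers import GraniteSpeechForConditionalGeneration, GraniteSpeechProcessor
+
+model_path = "path/to/granite-speech-checkpoint"  # placeholder, not a released model ID
+processor = GraniteSpeechProcessor.from_pretrained(model_path)
+model = GraniteSpeechForConditionalGeneration.from_pretrained(model_path).to("cuda")
+
+# Load an audio file (ideally mono, 16 kHz to match the feature extractor defaults)
+# and build a prompt containing the audio placeholder token.
+wav, sampling_rate = torchaudio.load("audio.wav", normalize=True)
+prompt = f"{processor.audio_token}can you transcribe the speech into a written format?"
+
+# The processor expands the audio placeholder to match the number of projected
+# audio features and computes the mel features on the requested device.
+inputs = processor(text=prompt, audio=wav, device="cuda")
+
+outputs = model.generate(**inputs.to("cuda"), max_new_tokens=200)
+print(processor.tokenizer.decode(outputs[0], skip_special_tokens=True))
+```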
+ + ## GraniteSpeechConfig From c4a9f64ef3e8818dc0b35df1289d60ad10ca4fa4 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 4 Apr 2025 11:00:07 +0000 Subject: [PATCH 098/116] Remove granite speech qformer ref --- utils/check_repo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/check_repo.py b/utils/check_repo.py index 0338b84ee695..4dcfadefc9e2 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -195,7 +195,6 @@ "GitVisionModel", "GraphormerModel", "GraphormerForGraphClassification", - "GraniteSpeechQFormerModel", "BlipForImageTextRetrieval", "BlipForQuestionAnswering", "BlipVisionModel", From f207aac34fd0841a9cd4675e312846f2e1f2dc52 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Wed, 9 Apr 2025 19:04:50 +0000 Subject: [PATCH 099/116] Add paper link, explicit calc for qkv --- .../models/granite_speech/modeling_granite_speech.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 8f93ebdca2c8..2c2ec3af5931 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -182,7 +182,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class GraniteSpeechConformerAttention(nn.Module): - """Attention for conformer blocks with shaw's relpos embeddings.""" + """Attention for conformer blocks using Shaw's relative positional embeddings. + See the following [paper](https://arxiv.org/pdf/1803.02155) for more details. + """ def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() @@ -215,10 +217,10 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: query_states = self.to_q(hidden_states) key_states, value_states = self.to_kv(hidden_states).chunk(2, dim=-1) - query_states, key_states, value_states = [ - t.reshape(bsz, num_blocks, self.context_size, self.num_heads, -1).transpose(2, 3) - for t in (query_states, key_states, value_states) - ] + + query_states = query_states.reshape(bsz, num_blocks, self.context_size, self.num_heads, -1).transpose(2, 3) + key_states = key_states.reshape(bsz, num_blocks, self.context_size, self.num_heads, -1).transpose(2, 3) + value_states = value_states.reshape(bsz, num_blocks, self.context_size, self.num_heads, -1).transpose(2, 3) # shaw's relative positional embedding seq = torch.arange(self.context_size, device=hidden_states.device) From 882fb639899937474725bfc0cb55d25b25a08610 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Wed, 9 Apr 2025 19:14:25 +0000 Subject: [PATCH 100/116] Calculate padding directly in depthwise conv1d init --- .../models/granite_speech/modeling_granite_speech.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 2c2ec3af5931..5cabd59e2ee3 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -252,13 +252,12 @@ class GraniteSpeechConformerConvModule(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() inner_dim = config.hidden_dim * config.conv_expansion_factor - padding = self.calc_same_padding(config.conv_kernel_size) self.norm = nn.LayerNorm(config.hidden_dim) self.up_conv = nn.Conv1d(config.hidden_dim, inner_dim * 2, 1) 
self.glu = nn.GLU(dim=1) self.depth_conv = GraniteSpeechConformerDepthWiseConv1d( - inner_dim, inner_dim, kernel_size=config.conv_kernel_size, padding=padding + inner_dim, inner_dim, kernel_size=config.conv_kernel_size, ) self.silu = nn.SiLU() self.batch_norm = nn.BatchNorm1d(inner_dim) @@ -285,9 +284,13 @@ def calc_same_padding(kernel_size: int) -> Tuple[int, int]: class GraniteSpeechConformerDepthWiseConv1d(nn.Module): """Wrapper for padded 1D pointwise convolution.""" - def __init__(self, chan_in: int, chan_out: int, kernel_size: int, padding: Tuple[int, int]): + def __init__(self, chan_in: int, chan_out: int, kernel_size: int): super().__init__() - self.padding = padding + # Padding for the 1D conv is symmetric or close (i.e., offset by one). + pad = kernel_size // 2 + pad_offset = (kernel_size + 1) % 2 + self.padding = (pad, pad - pad_offset) + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in, bias=False) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: From e7db05a2ad346e12e7b3c680158f18f6345f4c66 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Wed, 9 Apr 2025 19:17:15 +0000 Subject: [PATCH 101/116] Raise value error instead of asserting --- .../models/granite_speech/modeling_granite_speech.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 5cabd59e2ee3..b037739354b5 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -631,9 +631,9 @@ def get_merged_audio_embeddings( special_audio_mask = is_audio_index.unsqueeze(-1) audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) if input_features_mask is not None: - assert torch.all(is_audio_index.int().sum(dim=1) == input_features_mask.int().sum(dim=1)).item(), ( - "Number of audio tokens does not match number of audio features" - ) + if torch.all(is_audio_index.int().sum(dim=1) != input_features_mask.int().sum(dim=1)).item(): + raise ValueError("Number of audio tokens does not match number of audio features") + audio_features = audio_features[input_features_mask] inputs_embeds = inputs_embeds.masked_scatter( From fe8242d7e8c0b06a1b04eb94fb4eaa4f7b8e3317 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Wed, 9 Apr 2025 19:22:28 +0000 Subject: [PATCH 102/116] Reorder class defs (classes used at top) --- .../granite_speech/modeling_granite_speech.py | 110 ++++++++---------- 1 file changed, 51 insertions(+), 59 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index b037739354b5..cd5e8f792480 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -119,48 +119,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ### Encoder - conformer is adapted from: https://github.com/lucidrains/conformer.git -class GraniteSpeechCTCEncoder(nn.Module): - def __init__(self, config: GraniteSpeechEncoderConfig): - super().__init__() - self.config = config - self.input_linear = nn.Linear(config.input_dim, config.hidden_dim, bias=True) - self.layers = nn.ModuleList([GraniteSpeechConformerBlock(config) for _ in range(config.num_layers)]) - - self.out = nn.Linear(config.hidden_dim, config.output_dim, bias=True) - self.out_mid = 
nn.Linear(config.output_dim, config.hidden_dim, bias=True) - self.num_layers = config.num_layers - - def forward(self, hidden_states: torch.Tensor): - hidden_states = self.input_linear(hidden_states) - for idx, layer in enumerate(self.layers, start=1): - hidden_states = layer(hidden_states) - if idx == self.num_layers // 2: - hidden_states_mid = hidden_states.clone() - hidden_states_mid = self.out(hidden_states_mid) - hidden_states += self.out_mid(nn.Softmax(dim=-1)(hidden_states_mid)) - return hidden_states - - -class GraniteSpeechConformerBlock(nn.Module): - """Conformer block, consisting largely of linear layers, attention, and convolutional layers.""" - - def __init__(self, config: GraniteSpeechEncoderConfig): - super().__init__() - self.ff1 = GraniteSpeechConformerFeedForward(config) - self.attn = GraniteSpeechConformerAttention(config) - self.conv = GraniteSpeechConformerConvModule(config) - self.ff2 = GraniteSpeechConformerFeedForward(config) - self.post_norm = nn.LayerNorm(config.hidden_dim) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = 0.5 * self.ff1(hidden_states) + hidden_states - hidden_states = self.attn(hidden_states) + hidden_states - hidden_states = self.conv(hidden_states) + hidden_states - hidden_states = 0.5 * self.ff2(hidden_states) + hidden_states - hidden_states = self.post_norm(hidden_states) - return hidden_states - - class GraniteSpeechConformerFeedForward(nn.Module): """Feedforward module for conformer encoder blocks.""" @@ -180,7 +138,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.dropout(hidden_states) return hidden_states - class GraniteSpeechConformerAttention(nn.Module): """Attention for conformer blocks using Shaw's relative positional embeddings. See the following [paper](https://arxiv.org/pdf/1803.02155) for more details. @@ -246,6 +203,23 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return self.dropout(out) +class GraniteSpeechConformerDepthWiseConv1d(nn.Module): + """Wrapper for padded 1D pointwise convolution.""" + + def __init__(self, chan_in: int, chan_out: int, kernel_size: int): + super().__init__() + # Padding for the 1D conv is symmetric or close (i.e., offset by one). 
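+        # For example, kernel_size=15 yields padding=(7, 7), while kernel_size=16 yields padding=(8, 7).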
+ pad = kernel_size // 2 + pad_offset = (kernel_size + 1) % 2 + self.padding = (pad, pad - pad_offset) + + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in, bias=False) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = F.pad(hidden_states, self.padding) + return self.conv(hidden_states) + + class GraniteSpeechConformerConvModule(nn.Module): """Conformer conv module consisting of several 1D/depthwise 1D convolutional layers.""" @@ -274,28 +248,46 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.dropout(hidden_states) return hidden_states - @staticmethod - def calc_same_padding(kernel_size: int) -> Tuple[int, int]: - """Calculates symmetric padding for the depthwise 1D convolution.""" - pad = kernel_size // 2 - return (pad, pad - (kernel_size + 1) % 2) +class GraniteSpeechConformerBlock(nn.Module): + """Conformer block, consisting largely of linear layers, attention, and convolutional layers.""" + def __init__(self, config: GraniteSpeechEncoderConfig): + super().__init__() + self.ff1 = GraniteSpeechConformerFeedForward(config) + self.attn = GraniteSpeechConformerAttention(config) + self.conv = GraniteSpeechConformerConvModule(config) + self.ff2 = GraniteSpeechConformerFeedForward(config) + self.post_norm = nn.LayerNorm(config.hidden_dim) -class GraniteSpeechConformerDepthWiseConv1d(nn.Module): - """Wrapper for padded 1D pointwise convolution.""" + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = 0.5 * self.ff1(hidden_states) + hidden_states + hidden_states = self.attn(hidden_states) + hidden_states + hidden_states = self.conv(hidden_states) + hidden_states + hidden_states = 0.5 * self.ff2(hidden_states) + hidden_states + hidden_states = self.post_norm(hidden_states) + return hidden_states - def __init__(self, chan_in: int, chan_out: int, kernel_size: int): + +class GraniteSpeechCTCEncoder(nn.Module): + def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() - # Padding for the 1D conv is symmetric or close (i.e., offset by one). 
- pad = kernel_size // 2 - pad_offset = (kernel_size + 1) % 2 - self.padding = (pad, pad - pad_offset) + self.config = config + self.input_linear = nn.Linear(config.input_dim, config.hidden_dim, bias=True) + self.layers = nn.ModuleList([GraniteSpeechConformerBlock(config) for _ in range(config.num_layers)]) - self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in, bias=False) + self.out = nn.Linear(config.hidden_dim, config.output_dim, bias=True) + self.out_mid = nn.Linear(config.output_dim, config.hidden_dim, bias=True) + self.num_layers = config.num_layers - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = F.pad(hidden_states, self.padding) - return self.conv(hidden_states) + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.input_linear(hidden_states) + for idx, layer in enumerate(self.layers, start=1): + hidden_states = layer(hidden_states) + if idx == self.num_layers // 2: + hidden_states_mid = hidden_states.clone() + hidden_states_mid = self.out(hidden_states_mid) + hidden_states += self.out_mid(nn.Softmax(dim=-1)(hidden_states_mid)) + return hidden_states GRANITE_SPEECH_START_DOCSTRING = r""" From 70d84ea43e1664eaa1ff533056982fc73abeadac Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Wed, 9 Apr 2025 19:58:24 +0000 Subject: [PATCH 103/116] Precompute relpos distances --- .../models/granite_speech/modeling_granite_speech.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index cd5e8f792480..48dd8505828a 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -159,6 +159,11 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.rel_pos_emb = nn.Embedding(2 * self.max_pos_emb + 1, self.dim_head) self.dropout = nn.Dropout(config.dropout) + # Precompute clamped relative positional encoding distances + seq = torch.arange(self.context_size) + relpos_dist = seq.view(-1, 1) - seq.view(1, -1) + self.relpos_dist = torch.clamp(relpos_dist, -self.context_size, self.context_size) + self.max_pos_emb + if self.context_size <= 0 or self.context_size > self.max_pos_emb: raise ValueError("Context size is either less than 0 or exceeds the max_pos_emb") @@ -180,10 +185,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: value_states = value_states.reshape(bsz, num_blocks, self.context_size, self.num_heads, -1).transpose(2, 3) # shaw's relative positional embedding - seq = torch.arange(self.context_size, device=hidden_states.device) - dist = seq.view(-1, 1) - seq.view(1, -1) - dist = torch.clamp(dist, -self.context_size, self.context_size) + self.max_pos_emb - rel_pos_emb = self.rel_pos_emb(dist).to(query_states) + dist = self.relpos_dist.to(hidden_states.device) + rel_pos_emb = self.rel_pos_emb(dist) rel_pos_emb_expanded = rel_pos_emb.view([1, 1, 1] + list(rel_pos_emb.shape)) pos_attn = torch.sum(query_states.unsqueeze(-2) * rel_pos_emb_expanded, dim=-1) * self.scale From 4d7e7944053a27a33545c859c9c12ab1d6647dbd Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Wed, 9 Apr 2025 20:07:55 +0000 Subject: [PATCH 104/116] Run formatting --- .../models/granite_speech/modeling_granite_speech.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py 
b/src/transformers/models/granite_speech/modeling_granite_speech.py index 48dd8505828a..e664d6774441 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -138,6 +138,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.dropout(hidden_states) return hidden_states + class GraniteSpeechConformerAttention(nn.Module): """Attention for conformer blocks using Shaw's relative positional embeddings. See the following [paper](https://arxiv.org/pdf/1803.02155) for more details. @@ -234,7 +235,9 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.up_conv = nn.Conv1d(config.hidden_dim, inner_dim * 2, 1) self.glu = nn.GLU(dim=1) self.depth_conv = GraniteSpeechConformerDepthWiseConv1d( - inner_dim, inner_dim, kernel_size=config.conv_kernel_size, + inner_dim, + inner_dim, + kernel_size=config.conv_kernel_size, ) self.silu = nn.SiLU() self.batch_norm = nn.BatchNorm1d(inner_dim) @@ -251,6 +254,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.dropout(hidden_states) return hidden_states + class GraniteSpeechConformerBlock(nn.Module): """Conformer block, consisting largely of linear layers, attention, and convolutional layers.""" From c419426e2e4777f9d1d095e8c5d2278ca21330e4 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Wed, 9 Apr 2025 21:31:53 +0000 Subject: [PATCH 105/116] Pass attention distances through forward --- .../granite_speech/modeling_granite_speech.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index e664d6774441..6c8aa65bbdf9 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -160,15 +160,10 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.rel_pos_emb = nn.Embedding(2 * self.max_pos_emb + 1, self.dim_head) self.dropout = nn.Dropout(config.dropout) - # Precompute clamped relative positional encoding distances - seq = torch.arange(self.context_size) - relpos_dist = seq.view(-1, 1) - seq.view(1, -1) - self.relpos_dist = torch.clamp(relpos_dist, -self.context_size, self.context_size) + self.max_pos_emb - if self.context_size <= 0 or self.context_size > self.max_pos_emb: raise ValueError("Context size is either less than 0 or exceeds the max_pos_emb") - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + def forward(self, hidden_states: torch.Tensor, attention_dists: torch.Tensor) -> torch.Tensor: hidden_states = self.pre_norm(hidden_states) bsz, num_features, _ = hidden_states.shape @@ -186,7 +181,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: value_states = value_states.reshape(bsz, num_blocks, self.context_size, self.num_heads, -1).transpose(2, 3) # shaw's relative positional embedding - dist = self.relpos_dist.to(hidden_states.device) + dist = attention_dists.to(hidden_states.device) rel_pos_emb = self.rel_pos_emb(dist) rel_pos_emb_expanded = rel_pos_emb.view([1, 1, 1] + list(rel_pos_emb.shape)) pos_attn = torch.sum(query_states.unsqueeze(-2) * rel_pos_emb_expanded, dim=-1) * self.scale @@ -266,9 +261,9 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.ff2 = GraniteSpeechConformerFeedForward(config) self.post_norm = nn.LayerNorm(config.hidden_dim) - def forward(self, 
hidden_states: torch.Tensor) -> torch.Tensor: + def forward(self, hidden_states: torch.Tensor, attention_dists: torch.Tensor) -> torch.Tensor: hidden_states = 0.5 * self.ff1(hidden_states) + hidden_states - hidden_states = self.attn(hidden_states) + hidden_states + hidden_states = self.attn(hidden_states, attention_dists=attention_dists) + hidden_states hidden_states = self.conv(hidden_states) + hidden_states hidden_states = 0.5 * self.ff2(hidden_states) + hidden_states hidden_states = self.post_norm(hidden_states) @@ -279,6 +274,12 @@ class GraniteSpeechCTCEncoder(nn.Module): def __init__(self, config: GraniteSpeechEncoderConfig): super().__init__() self.config = config + + # Precompute clamped relative positional encoding distances + seq = torch.arange(config.context_size) + relpos_dist = seq.view(-1, 1) - seq.view(1, -1) + self.attention_dists = torch.clamp(relpos_dist, -config.context_size, config.context_size) + config.max_pos_emb + self.input_linear = nn.Linear(config.input_dim, config.hidden_dim, bias=True) self.layers = nn.ModuleList([GraniteSpeechConformerBlock(config) for _ in range(config.num_layers)]) @@ -289,7 +290,8 @@ def __init__(self, config: GraniteSpeechEncoderConfig): def forward(self, hidden_states: torch.Tensor): hidden_states = self.input_linear(hidden_states) for idx, layer in enumerate(self.layers, start=1): - hidden_states = layer(hidden_states) + hidden_states = layer(hidden_states, attention_dists=self.attention_dists) + if idx == self.num_layers // 2: hidden_states_mid = hidden_states.clone() hidden_states_mid = self.out(hidden_states_mid) From b281bc4f15a80a5fc9a1a2182d8fe5e742a11f0f Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Thu, 10 Apr 2025 12:50:39 -0600 Subject: [PATCH 106/116] Apply suggestions from code review Co-authored-by: eustlb <94853470+eustlb@users.noreply.github.com> --- .../granite_speech/feature_extraction_granite_speech.py | 6 +++--- .../models/granite_speech/modeling_granite_speech.py | 2 +- .../models/granite_speech/processing_granite_speech.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py index f5c90656fd90..a1e57a62bef0 100644 --- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -20,9 +20,9 @@ import numpy as np -from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin -from transformers.tokenization_utils_base import AudioInput -from transformers.utils import is_torch_available, is_torchaudio_available, logging +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...tokenization_utils_base import AudioInput +from ...utils import is_torch_available, is_torchaudio_available, logging logger = logging.get_logger(__name__) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 6c8aa65bbdf9..e28eb3e131cc 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -524,7 +524,7 @@ def forward( if input_features.dtype != self.dtype: input_features = input_features.to(self.dtype) # Get the audio features from the encoder / projector - audio_features = self.get_audio_features(input_features) + audio_embeds = 
self.get_audio_features(input_features) # Merge the audio features into the LLM embeddings inputs_embeds = self.get_merged_audio_embeddings( diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 3a68cc39137c..0f3512a8ee49 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -46,7 +46,7 @@ def __init__( def __call__( self, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], - audios: Union[torch.Tensor, List[torch.Tensor]] = None, + audio: Union[torch.Tensor, List[torch.Tensor]] = None, device: str = "cpu", **kwargs, ) -> BatchFeature: From d66fb7b9cd1c8c0a5b612c437f074adc32e34b8b Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 10 Apr 2025 19:15:14 +0000 Subject: [PATCH 107/116] Add todo for using common batch feature extraction --- .../models/granite_speech/feature_extraction_granite_speech.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py index a1e57a62bef0..392b6fe6407a 100644 --- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -90,6 +90,8 @@ def _ensure_melspec_transform_is_initialized(self): transform is not JSON serializable. """ if self.melspec is None: + # TODO (@alex-jw-brooks / @eustlb) move this to common batch + # feature extraction in audio utils once they are written! self.melspec = torchaudio.transforms.MelSpectrogram(**self.melspec_kwargs) def _extract_mel_spectrograms(self, audio: torch.Tensor, device="cpu"): From cce62532a2c6e770205e64ce31a550d1a9e14b4f Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 10 Apr 2025 19:18:07 +0000 Subject: [PATCH 108/116] Rename audios/features --- .../granite_speech/modeling_granite_speech.py | 2 +- .../processing_granite_speech.py | 4 ++-- .../test_modeling_granite_speech.py | 2 +- .../test_processor_granite_speech.py | 22 +++++++++---------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index e28eb3e131cc..f4e451d118b7 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -529,7 +529,7 @@ def forward( # Merge the audio features into the LLM embeddings inputs_embeds = self.get_merged_audio_embeddings( input_ids=input_ids, - audio_features=audio_features, + audio_features=audio_embeds, input_features_mask=input_features_mask, ) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 0f3512a8ee49..c6f91c71ed3c 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -53,12 +53,12 @@ def __call__( text = self._get_validated_text(text) prompt_strings = text - if audios is not None: + if audio is not None: # NOTE - we intentionally avoid throwing for potentially misaligned # text / audio inputs here because some inference engines will # trigger the conditions due to the way they call 
multimodal # processors, e.g., vLLM. - audio_inputs = self.audio_processor(audios, device=device) + audio_inputs = self.audio_processor(audio, device=device) audio_embed_sizes = audio_inputs.pop("audio_embed_sizes") # Expand the audio placeholders to match the feature dims; this diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index ec19382379f9..02b1c4600b9e 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -384,7 +384,7 @@ def test_small_model_integration_test_batch(self): EXPECTED_DECODED_TEXT = [ "systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantmister quilter is the apostle of the middle classes and we are glad to welcome his gospel", - "systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantnor is mr quilter's manner less interesting than his matter" + "systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantnor is mister quilter's manner less interesting than his matter" ] # fmt: skip self.assertEqual( diff --git a/tests/models/granite_speech/test_processor_granite_speech.py b/tests/models/granite_speech/test_processor_granite_speech.py index 6d0ab43b3514..a566658f63df 100644 --- a/tests/models/granite_speech/test_processor_granite_speech.py +++ b/tests/models/granite_speech/test_processor_granite_speech.py @@ -90,7 +90,7 @@ def test_bad_text_fails(self): processor = GraniteSpeechProcessor(tokenizer=tokenizer, audio_processor=audio_processor) with pytest.raises(TypeError): - processor(text=424, audios=None) + processor(text=424, audio=None) def test_bad_nested_text_fails(self): """Ensure we gracefully fail if text is the wrong nested type.""" @@ -102,9 +102,9 @@ def test_bad_nested_text_fails(self): ) with pytest.raises(TypeError): - processor(text=[424], audios=None) + processor(text=[424], audio=None) - def test_bad_audios_fails(self): + def test_bad_audio_fails(self): """Ensure we gracefully fail if audio is the wrong type.""" tokenizer = self.get_tokenizer() audio_processor = self.get_audio_processor() @@ -114,9 +114,9 @@ def test_bad_audios_fails(self): ) with pytest.raises(TypeError): - processor(text=None, audios="foo") + processor(text=None, audio="foo") - def test_nested_bad_audios_fails(self): + def test_nested_bad_audio_fails(self): """Ensure we gracefully fail if audio is the wrong nested type.""" tokenizer = self.get_tokenizer() audio_processor = self.get_audio_processor() @@ -126,7 +126,7 @@ def test_nested_bad_audios_fails(self): ) with pytest.raises(TypeError): - processor(text=None, audios=["foo"]) + processor(text=None, audio=["foo"]) @parameterized.expand( [ @@ -147,10 +147,10 @@ def test_audio_token_filling_same_len_feature_tensors(self, vec_dims, num_expect tokenizer=tokenizer, audio_processor=audio_processor, ) - audios = random_func(*vec_dims) - 0.5 + audio = random_func(*vec_dims) - 0.5 audio_tokens = processor.audio_token * vec_dims[0] - inputs = processor(text=f"{audio_tokens} Can you compare this audio?", audios=audios, 
return_tensors="pt") + inputs = processor(text=f"{audio_tokens} Can you compare this audio?", audio=audio, return_tensors="pt") # Check the number of audio tokens audio_token_id = tokenizer.get_vocab()[processor.audio_token] @@ -175,14 +175,14 @@ def test_audio_token_filling_varying_len_feature_list(self): ) vec_dims = [[1, 142100], [1, 269920]] num_expected_features = [90, 171] - audios = [torch.rand(dims) - 0.5 for dims in vec_dims] + audio = [torch.rand(dims) - 0.5 for dims in vec_dims] inputs = processor( text=[ f"{processor.audio_token} Can you describe this audio?", f"{processor.audio_token} How does it compare with this audio?", ], - audios=audios, + audio=audio, return_tensors="pt", ) @@ -214,7 +214,7 @@ def test_device_override(self): inputs = processor( text=f"{processor.audio_token} Can you transcribe this audio?", - audios=wav, + audio=wav, return_tensors="pt", device="cuda", ) From 15ecc0781a6f06629daceea6b5790454d9b2ba0b Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 10 Apr 2025 21:22:30 +0000 Subject: [PATCH 109/116] Ensure chat template may be provided to processor --- .../models/granite_speech/processing_granite_speech.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index c6f91c71ed3c..5416b59f5d5d 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -39,15 +39,18 @@ def __init__( audio_processor, tokenizer, audio_token="<|audio|>", + chat_template=None, ): self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token - super().__init__(audio_processor, tokenizer) + super().__init__(audio_processor, tokenizer, chat_template=chat_template) def __call__( self, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], audio: Union[torch.Tensor, List[torch.Tensor]] = None, device: str = "cpu", + images=None, + videos=None, **kwargs, ) -> BatchFeature: text = self._get_validated_text(text) From 6819ea6a8ed0ea7419d791506930f8c149d66ca8 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 10 Apr 2025 21:46:07 +0000 Subject: [PATCH 110/116] Move granite speech docs to audio models --- docs/source/en/_toctree.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 0541d5f24cc1..fa1aa7491c2d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -489,8 +489,6 @@ title: GraniteMoe - local: model_doc/granitemoeshared title: GraniteMoeShared - - local: model_doc/granite_speech - title: GraniteSpeech - local: model_doc/granitevision title: GraniteVision - local: model_doc/helium @@ -825,6 +823,8 @@ title: EnCodec - local: model_doc/fastspeech2_conformer title: FastSpeech2Conformer + - local: model_doc/granite_speech + title: GraniteSpeech - local: model_doc/hubert title: Hubert - local: model_doc/mctct From ab60ab6a570b22484e5ae3a1a4e1304625a9d723 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 11 Apr 2025 09:03:27 +0000 Subject: [PATCH 111/116] Add todos for input proc refactoring --- .../feature_extraction_granite_speech.py | 12 +++++++++--- .../granite_speech/processing_granite_speech.py | 4 ++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py 
b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py index 392b6fe6407a..a974212f13a1 100644 --- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -74,9 +74,15 @@ def __call__( ) audio_embed_sizes = self._get_num_audio_features(audio_lengths) speech_inputs["audio_embed_sizes"] = audio_embed_sizes - # TODO: input_features_mask is not a great name, because - # input_features and input_features_mask have different shapes - # (before/after the projector) + # TODO (@alex-jw-brooks): Currently input_features_mask is not + # a great name, because input_features and input_features_mask + # have different shapes (before/after the projector). + # + # We should align this with other multimodal models, e.g,. llava + # and qwen2audio and refactor this to ensure input_feature_mask + # has the same dimensionality as input_features, or compute it in + # the model based on the audio embedding sizes (since we do not + # have an attention mask for the audio features to infer padding from). speech_inputs["input_features_mask"] = torch.arange(max(audio_embed_sizes)).view(1, -1) < torch.tensor( audio_embed_sizes ).view(-1, 1) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 5416b59f5d5d..2c346c79471c 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -62,6 +62,10 @@ def __call__( # trigger the conditions due to the way they call multimodal # processors, e.g., vLLM. audio_inputs = self.audio_processor(audio, device=device) + + # TODO (@alex-jw-brooks); we should add a util to get_num_audio_tokens + # from feature lengths and call it here, rather than returning it + # from the feature extractor. 
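As a quick, standalone sketch of the masking logic referenced in the feature-extraction hunk above (the helper name below is hypothetical; only the one-liner mirrors the actual patch):

import torch

def build_input_features_mask(audio_embed_sizes):
    # Mirrors the input_features_mask computation above: row i is True for the
    # first audio_embed_sizes[i] positions and False for the padded remainder.
    return torch.arange(max(audio_embed_sizes)).view(1, -1) < torch.tensor(audio_embed_sizes).view(-1, 1)

# e.g. audio_embed_sizes = [90, 171] -> a (2, 171) boolean mask with 90 and 171
# True entries in its two rows, matching the varying-length processor test earlier
# in the series.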
audio_embed_sizes = audio_inputs.pop("audio_embed_sizes") # Expand the audio placeholders to match the feature dims; this From ebf694a6d18b93477d61b665f1c8fd0074754179 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 11 Apr 2025 09:30:24 +0000 Subject: [PATCH 112/116] Fix import order --- src/transformers/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 5d30143f9806..43eebbcb3a5a 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -125,9 +125,9 @@ from .gpt_sw3 import * from .gptj import * from .granite import * + from .granite_speech import * from .granitemoe import * from .granitemoeshared import * - from .granite_speech import * from .grounding_dino import * from .groupvit import * from .helium import * From 8ce898d6c3422ebcc9fd4d5d40d6d50f0bd271dd Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 11 Apr 2025 09:46:41 +0000 Subject: [PATCH 113/116] Guard torch import --- .../models/granite_speech/processing_granite_speech.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 2c346c79471c..1366ecf24548 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -16,13 +16,14 @@ from typing import List, Union -import torch - from transformers.feature_extraction_utils import BatchFeature from transformers.processing_utils import ProcessorMixin from transformers.tokenization_utils import PreTokenizedInput, TextInput -from transformers.utils import logging +from transformers.utils import is_torch_available, logging + +if is_torch_available(): + import torch logger = logging.get_logger(__name__) From c0ca6bc61f1a2439a5ba235ce6120fe23915be99 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 11 Apr 2025 09:53:22 +0000 Subject: [PATCH 114/116] Use relative imports --- .../granite_speech/configuration_granite_speech.py | 4 ++-- .../granite_speech/modeling_granite_speech.py | 14 +++++--------- .../granite_speech/processing_granite_speech.py | 8 ++++---- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py index 9cca41a17a4e..e1355db41fdc 100644 --- a/src/transformers/models/granite_speech/configuration_granite_speech.py +++ b/src/transformers/models/granite_speech/configuration_granite_speech.py @@ -14,8 +14,8 @@ # limitations under the License. 
"""Config class for Granite Speech.""" -from transformers.configuration_utils import PretrainedConfig -from transformers.models.auto import CONFIG_MAPPING, AutoConfig +from ...configuration_utils import PretrainedConfig +from ..auto import CONFIG_MAPPING, AutoConfig class GraniteSpeechEncoderConfig(PretrainedConfig): diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index f4e451d118b7..821539d41639 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -19,23 +19,19 @@ import torch import torch.nn.functional as F -import torch.utils.checkpoint from torch import nn -from transformers.generation import GenerationMixin -from transformers.modeling_outputs import ( - ModelOutput, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.models.auto import AutoModel, AutoModelForCausalLM -from transformers.utils import ( +from ...generation import GenerationMixin +from ...modeling_outputs import ModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, is_peft_available, logging, replace_return_docstrings, ) - +from ..auto import AutoModel, AutoModelForCausalLM from .configuration_granite_speech import ( GraniteSpeechConfig, GraniteSpeechEncoderConfig, diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 1366ecf24548..25badd505e6b 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -16,10 +16,10 @@ from typing import List, Union -from transformers.feature_extraction_utils import BatchFeature -from transformers.processing_utils import ProcessorMixin -from transformers.tokenization_utils import PreTokenizedInput, TextInput -from transformers.utils import is_torch_available, logging +from ...feature_extraction_utils import BatchFeature +from ...processing_utils import ProcessorMixin +from ...tokenization_utils import PreTokenizedInput, TextInput +from ...utils import is_torch_available, logging if is_torch_available(): From 93486bf589d5d1cf06719b8d6699b25e0492257b Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 11 Apr 2025 10:00:28 +0000 Subject: [PATCH 115/116] Require torch backend for processor in granite speech --- .../models/granite_speech/processing_granite_speech.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 25badd505e6b..ec36eb497031 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -20,6 +20,7 @@ from ...processing_utils import ProcessorMixin from ...tokenization_utils import PreTokenizedInput, TextInput from ...utils import is_torch_available, logging +from ...utils.import_utils import requires_backends if is_torch_available(): @@ -48,12 +49,14 @@ def __init__( def __call__( self, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], - audio: Union[torch.Tensor, List[torch.Tensor]] = None, + audio: Union["torch.Tensor", List["torch.Tensor"]] = None, device: str = "cpu", 
images=None, videos=None, **kwargs, ) -> BatchFeature: + requires_backends(self, ["torch"]) + text = self._get_validated_text(text) prompt_strings = text From 677b4e524e7c891bc2ffbb682de773f5838b7555 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Fri, 11 Apr 2025 10:10:39 +0000 Subject: [PATCH 116/116] Add backend guards in feature extractor --- .../feature_extraction_granite_speech.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py index a974212f13a1..14b4bb10c433 100644 --- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py +++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -23,6 +23,7 @@ from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin from ...tokenization_utils_base import AudioInput from ...utils import is_torch_available, is_torchaudio_available, logging +from ...utils.import_utils import requires_backends logger = logging.get_logger(__name__) @@ -66,6 +67,8 @@ def __call__( audios: AudioInput, device: Optional[str] = "cpu", ) -> BatchFeature: + requires_backends(self, ["torchaudio"]) + speech_inputs = {} batched_audio, audio_lengths = self._get_audios_and_audio_lengths(audios) speech_inputs["input_features"] = self._extract_mel_spectrograms( @@ -95,15 +98,19 @@ def _ensure_melspec_transform_is_initialized(self): We do this for now since some logging explodes since the mel spectrogram transform is not JSON serializable. """ + requires_backends(self, ["torchaudio"]) + if self.melspec is None: # TODO (@alex-jw-brooks / @eustlb) move this to common batch # feature extraction in audio utils once they are written! self.melspec = torchaudio.transforms.MelSpectrogram(**self.melspec_kwargs) - def _extract_mel_spectrograms(self, audio: torch.Tensor, device="cpu"): + def _extract_mel_spectrograms(self, audio: "torch.Tensor", device="cpu"): """ Compute the Mel features to be passed to the conformer encoder. """ + requires_backends(self, ["torchaudio"]) + # Initialize the mel spectrogram if isn't not already and # move the melspec / audio to the computation device. self._ensure_melspec_transform_is_initialized() @@ -156,7 +163,7 @@ def _get_num_audio_features(self, audio_lengths: Sequence[int]) -> Sequence[int] return projector_lengths - def _get_audios_and_audio_lengths(self, audios: AudioInput) -> Sequence[torch.Tensor, Sequence[int]]: + def _get_audios_and_audio_lengths(self, audios: AudioInput) -> Sequence["torch.Tensor", Sequence[int]]: """ Coerces audio inputs to torch tensors and extracts audio lengths prior to stacking. @@ -164,6 +171,8 @@ def _get_audios_and_audio_lengths(self, audios: AudioInput) -> Sequence[torch.Te audios (`AudioInput`): Audio sequence, numpy array, or torch tensor. """ + requires_backends(self, ["torch"]) + # Coerce to PyTorch tensors if we have numpy arrays, since # currently we have a dependency on torch/torchaudio anyway if isinstance(audios, np.ndarray):
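
For orientation, a minimal usage sketch of the processor call these patches converge on. The checkpoint id below is hypothetical, and the example assumes torch/torchaudio are installed and that GraniteSpeechProcessor is exported at the top level once the module is registered:

import torch
from transformers import GraniteSpeechProcessor

# Hypothetical repo id; any checkpoint bundling the Granite Speech tokenizer and
# audio feature extractor would be used the same way.
processor = GraniteSpeechProcessor.from_pretrained("ibm-granite/granite-speech")

wav = torch.rand(1, 142100) - 0.5  # fake mono waveform, as in the processor tests
inputs = processor(
    text=f"{processor.audio_token} Can you transcribe this audio?",
    audio=wav,            # keyword renamed from `audios` in PATCH 108
    return_tensors="pt",
    device="cpu",         # mel spectrogram extraction runs on this device
)

# input_features feeds the conformer encoder, input_features_mask marks the
# non-padded audio positions, and the audio token in the prompt has been
# expanded to match the per-sample audio embedding sizes.
print(inputs["input_features"].shape, inputs["input_features_mask"].shape)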