huggingface · eustlb · Apr 11, 2025 · Mar 13, 2025 · Mar 14, 2025 · Mar 15, 2025
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -823,6 +823,8 @@
         title: EnCodec
       - local: model_doc/fastspeech2_conformer
         title: FastSpeech2Conformer
+      - local: model_doc/granite_speech
+        title: GraniteSpeech
       - local: model_doc/hubert
         title: Hubert
       - local: model_doc/mctct

diff --git a/docs/source/en/model_doc/granite_speech.md b/docs/source/en/model_doc/granite_speech.md
@@ -0,0 +1,68 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Granite Speech
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>
+
+## Overview
+The Granite Speech model is a multimodal language model, consisting of a speech encoder, speech projector, large language model, and LoRA adapter(s). More details regarding each component for the current (Granite 3.2 Speech) model architecture may be found below.
+
+1. Speech Encoder: A [Conformer](https://arxiv.org/abs/2005.08100) encoder trained with Connectionist Temporal Classification (CTC) on character-level targets on ASR corpora. The encoder uses block-attention and self-conditioned CTC from the middle layer.
+
+2. Speech Projector: A query transformer (q-former) operating on the outputs of the last encoder block. The encoder and projector temporally downsample the audio features to be merged into the multimodal embeddings to be processed by the LLM.
+
+3. Large Language Model: The Granite Speech model leverages Granite LLMs, which were originally proposed in [this paper](https://arxiv.org/abs/2408.13359).
+
+4. LoRA adapter(s): The Granite Speech model contains a modality-specific LoRA, which will be enabled when audio features are provided, and disabled otherwise.
+
+
+Note that most of the aforementioned components are implemented generically to enable compatibility and potential integration with other model architectures in Transformers.
+
+
+This model was contributed by [Alexander Brooks](https://huggingface.co/abrooks9944), [Avihu Dekel](https://huggingface.co/Avihu), and [George Saon](https://huggingface.co/gsaon).
+
+## Usage tips
+- This model bundles its own LoRA adapter, which will be automatically loaded and enabled/disabled as needed during inference calls. Be sure to install [PEFT](https://github.com/huggingface/peft) to ensure the LoRA is correctly applied!
+
+<!-- TODO (@alex-jw-brooks) Add an example here once the model compatible with the transformers implementation is released -->
+
+## GraniteSpeechConfig
+
+[[autodoc]] GraniteSpeechConfig
+
+
+## GraniteSpeechEncoderConfig
+
+[[autodoc]] GraniteSpeechEncoderConfig
+
+
+## GraniteSpeechProcessor
+
+[[autodoc]] GraniteSpeechProcessor
+
+
+## GraniteSpeechFeatureExtractor
+
+[[autodoc]] GraniteSpeechFeatureExtractor
+
+
+## GraniteSpeechForConditionalGeneration
+
+[[autodoc]] GraniteSpeechForConditionalGeneration
+    - forward
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -125,6 +125,7 @@
     from .gpt_sw3 import *
     from .gptj import *
     from .granite import *
+    from .granite_speech import *
     from .granitemoe import *
     from .granitemoeshared import *
     from .grounding_dino import *

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -142,6 +142,7 @@
         ("gptj", "GPTJConfig"),
         ("gptsan-japanese", "GPTSanJapaneseConfig"),
         ("granite", "GraniteConfig"),
+        ("granite_speech", "GraniteSpeechConfig"),
         ("granitemoe", "GraniteMoeConfig"),
         ("granitemoeshared", "GraniteMoeSharedConfig"),
         ("granitevision", "LlavaNextConfig"),
@@ -491,6 +492,7 @@
         ("gptj", "GPT-J"),
         ("gptsan-japanese", "GPTSAN-japanese"),
         ("granite", "Granite"),
+        ("granite_speech", "GraniteSpeech"),
         ("granitemoe", "GraniteMoeMoe"),
         ("granitemoeshared", "GraniteMoeSharedMoe"),
         ("granitevision", "LLaVA-NeXT"),

diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
@@ -61,6 +61,7 @@
         ("encodec", "EncodecFeatureExtractor"),
         ("flava", "FlavaFeatureExtractor"),
         ("glpn", "GLPNFeatureExtractor"),
+        ("granite_speech", "GraniteSpeechFeatureExtractor"),
         ("groupvit", "CLIPFeatureExtractor"),
         ("hubert", "Wav2Vec2FeatureExtractor"),
         ("imagegpt", "ImageGPTFeatureExtractor"),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -973,6 +973,7 @@
         ("encoder-decoder", "EncoderDecoderModel"),
         ("fsmt", "FSMTForConditionalGeneration"),
         ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
+        ("granite_speech", "GraniteSpeechForConditionalGeneration"),
         ("led", "LEDForConditionalGeneration"),
         ("longt5", "LongT5ForConditionalGeneration"),
         ("m2m_100", "M2M100ForConditionalGeneration"),
@@ -997,6 +998,7 @@
 
 MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict(
     [
+        ("granite_speech", "GraniteSpeechForConditionalGeneration"),
         ("moonshine", "MoonshineForConditionalGeneration"),
         ("pop2piano", "Pop2PianoForConditionalGeneration"),
         ("seamless_m4t", "SeamlessM4TForSpeechToText"),

diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
@@ -66,6 +66,7 @@
         ("gemma3", "Gemma3Processor"),
         ("git", "GitProcessor"),
         ("got_ocr2", "GotOcr2Processor"),
+        ("granite_speech", "GraniteSpeechProcessor"),
         ("grounding-dino", "GroundingDinoProcessor"),
         ("groupvit", "CLIPProcessor"),
         ("hubert", "Wav2Vec2Processor"),

diff --git a/src/transformers/models/granite_speech/__init__.py b/src/transformers/models/granite_speech/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # star-imports below are only evaluated by static type checkers
+    from .configuration_granite_speech import *
+    from .feature_extraction_granite_speech import *
+    from .modeling_granite_speech import *
+    from .processing_granite_speech import *
+else:  # at runtime, swap this package's sys.modules entry for a _LazyModule
+    import sys
+
+    _file = globals()["__file__"]  # this module's __file__, fed to define_import_structure
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/src/transformers/models/granite_speech/configuration_granite_speech.py b/src/transformers/models/granite_speech/configuration_granite_speech.py
@@ -0,0 +1,197 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Config class for Granite Speech."""
+
+from ...configuration_utils import PretrainedConfig
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+class GraniteSpeechEncoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GraniteSpeechCTCEncoder`]. It is used to instantiate
+    a Granite Speech audio encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the audio encoder of the Granite Speech
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        input_dim (`int`, *optional*, defaults to 160):
+            Dimension of the first hidden layer of the encoder.
+        num_layers (`int`, *optional*, defaults to 10):
+            Number of encoder blocks.
+        hidden_dim (`int`, *optional*, defaults to 1024):
+            The size of the intermediate layers in the conformer encoder.
+        feedforward_mult (`int`, *optional*, defaults to 4):
+            Multiplier for the up/down projections in the encoder's feedforward layers;
+            The projections will have intermediate dim of size `hidden_dim * feedforward_mult`.
+        num_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dim_head (`int`, *optional*, defaults to 128):
+            Dimension of attention heads for each attention layer in the Transformer encoder.
+        output_dim (`int`, *optional*, defaults to 42):
+            Intermediate dimension of the feedforward projections in the conformer
+            to be added to every other encoder block's output.
+        context_size (`int`, *optional*, defaults to 200):
+            Context size to be used in conformer attention.
+        max_pos_emb (`int`, *optional*, defaults to 512):
+            Max pos embeds to be used in attention (Shaw's relative positional encoding).
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for fully connected layers in the encoder.
+        conv_kernel_size (`int`, *optional*, defaults to 15):
+            Kernel size to be used for 1D convolution in each conformer block.
+        conv_expansion_factor (`int`, *optional*, defaults to 2):
+            Intermediate dimension to be used in conformer convolutions.
+
+    Example:
+
+    ```python
+    >>> from transformers import GraniteSpeechEncoderConfig, GraniteSpeechCTCEncoder
+
+    >>> # Initializing a GraniteSpeechEncoderConfig
+    >>> configuration = GraniteSpeechEncoderConfig()
+
+    >>> # Initializing a GraniteSpeechCTCEncoder (with random weights)
+    >>> model = GraniteSpeechCTCEncoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "granite_speech_encoder"
+
+    def __init__(
+        self,
+        input_dim=160,
+        num_layers=10,
+        hidden_dim=1024,
+        feedforward_mult=4,
+        num_heads=8,
+        dim_head=128,
+        output_dim=42,
+        context_size=200,
+        max_pos_emb=512,
+        dropout=0.1,
+        conv_kernel_size=15,
+        conv_expansion_factor=2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.input_dim = input_dim
+        self.num_layers = num_layers
+        self.hidden_dim = hidden_dim
+        self.feedforward_mult = feedforward_mult
+        self.num_heads = num_heads
+        self.dim_head = dim_head
+        self.output_dim = output_dim
+        self.context_size = context_size
+        self.dropout = dropout
+        self.conv_kernel_size = conv_kernel_size
+        self.conv_expansion_factor = conv_expansion_factor
+        self.max_pos_emb = max_pos_emb
+
+
+class GraniteSpeechConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GraniteSpeechForConditionalGeneration`]. It is used to instantiate a
+    Granite Speech model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `GraniteConfig`):
+            The config object or dictionary of the text backbone.
+        encoder_config (`Union[GraniteSpeechEncoderConfig, dict]`, *optional*):
+            The config object or dictionary of the Granite Speech CTC Encoder.
+        projector_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Blip2QFormerConfig`):
+            The config object or dictionary of the audio projector.
+        audio_token_index (`int`, *optional*, defaults to 49155):
+            The audio token index to encode the audio prompt.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        has_lora_adapter (`bool`, *optional*, defaults to `True`):
+            Indicates whether or not the model has a lora adapter that should only
+            be activated when processing audio inputs.
+        downsample_rate (`int`, *optional*, defaults to 5):
+            Downsample rate for the audio feature extractor.
+        window_size (`int`, *optional*, defaults to 15):
+            Window size for the audio feature projector.
+
+    Example:
+
+    ```python
+    >>> from transformers import GraniteSpeechConfig, GraniteSpeechForConditionalGeneration
+
+    >>> # Initializing a GraniteSpeechConfig
+    >>> configuration = GraniteSpeechConfig()
+
+    >>> # Initializing a GraniteSpeechForConditionalGeneration (with random weights)
+    >>> model = GraniteSpeechForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "granite_speech"
+    sub_configs = {
+        "text_config": AutoConfig,
+        "encoder_config": GraniteSpeechEncoderConfig,
+        "projector_config": AutoConfig,
+    }
+
+    def __init__(
+        self,
+        text_config=None,
+        encoder_config=None,
+        projector_config=None,
+        audio_token_index=49155,
+        initializer_range=0.02,
+        has_lora_adapter=True,
+        downsample_rate=5,
+        window_size=15,
+        **kwargs,
+    ):
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "granite"
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["granite"]()
+
+        if isinstance(projector_config, dict):
+            projector_config["model_type"] = (
+                projector_config["model_type"] if "model_type" in projector_config else "blip_2_qformer"
+            )
+            projector_config = CONFIG_MAPPING[projector_config["model_type"]](**projector_config)
+        elif projector_config is None:
+            projector_config = CONFIG_MAPPING["blip_2_qformer"]()
+
+        if not isinstance(encoder_config, GraniteSpeechEncoderConfig):
+            encoder_config = {} if encoder_config is None else encoder_config
+            encoder_config = GraniteSpeechEncoderConfig(**encoder_config)
+
+        self.text_config = text_config
+        self.encoder_config = encoder_config
+        self.projector_config = projector_config
+        self.audio_token_index = audio_token_index
+        self.initializer_range = initializer_range
+        self.has_lora_adapter = has_lora_adapter
+        self.downsample_rate = downsample_rate
+        self.window_size = window_size
+        super().__init__(**kwargs)
+
+
+__all__ = ["GraniteSpeechEncoderConfig", "GraniteSpeechConfig"]