
Commit 719411d

Initial Gemma3p5TextModel (#4)
NOTE: This implementation WILL CHANGE in the coming weeks; however, changes will be strictly additive and this will remain a suitable baseline for downstream implementations to reference.

* Adding KV Cache Sharing
* Adds Einsum layer to Gemma 3.5
* Updating EinsumLayer API
* Refactored kv cache sharing in attention
* Adding KVStore for cache sharing
* Update src/transformers/models/gemma3p5/modular_gemma3p5.py (Co-authored-by: Ryan Mullins <ryanmullins@google.com>)
* Update src/transformers/models/gemma3p5/modular_gemma3p5.py (Co-authored-by: Ryan Mullins <ryanmullins@google.com>)
* Update src/transformers/models/gemma3p5/modular_gemma3p5.py (Co-authored-by: Ryan Mullins <ryanmullins@google.com>)
* Update src/transformers/cache_utils.py (Co-authored-by: Ryan Mullins <ryanmullins@google.com>)
* Undoing erroneous force push
* Reverting RMSNorm to with_scale by default
* Adds LAuReL to Gemma 3.5
* Updating KV Cache Sharing implementation
* Updating the q and k norm definitions in the attention module
* Fixing name error for q,k,v RMS norm to use the right 3p5 module
* Updating MLP with activation sparsity
* Updating DecoderBlock for Gemma 3.5
* Updating kv cache sharing implementation with the use of a cache buffer and refactoring some lines of code
* Isolating KV Cache logic to relevant components
* Fixing logic error in Gemma3p5Attention.forward
* Refactoring caching contributions and fixing kv_store initialization
* Simplifying Configs
* Remove errant self from super init call
* Bug fix in the Attention module - changing self.head_dim to config.head_dim
* Bug fixes in the LaurelBlock and RMS Norm super init call
* Removing redundant code from a merge
* Adding per_layer_inputs to TextModel
* Adding preprocess embeddings with altup
* Adds per-layer-to-single output and a host of TODOs
* Integrating altup predict with the model workflow and other minor bug fixes
* Using nn.Embedding temporarily for text model
* It goes forward
* Minor refactor of attention sparsity and RoPE initialization
* Fixing duplicate rope_scaling param bug when loading from pretrained

---------

Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com>
Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com>
1 parent f76c5f9 commit 719411d

4 files changed: +1142 -850 lines changed


src/transformers/cache_utils.py

Lines changed: 65 additions & 0 deletions
@@ -2445,3 +2445,68 @@ def _prefetch_layer_in_context(self, layer_idx: int) -> None:
 
         self._device_key_cache[layer_idx & 1].copy_(self.key_cache[layer_idx], non_blocking=True)
         self._device_value_cache[layer_idx & 1].copy_(self.value_cache[layer_idx], non_blocking=True)
+
+
+@dataclass
+class KVStore:
+    """KV cache container for a single layer.
+
+    For holding the k, v values during training for sharing them across depth.
+    Unlike AttenionKVCache, it does not split the cache into prefill and
+    generation segments or implement a cursor.
+    This is not stacked in a layer scan.
+
+    Shapes:
+        k: [batch_size, seq_len, num_kv_heads, head_dim]
+        v: [batch_size, seq_len, num_kv_heads, head_dim]
+    """
+
+    def __init__(self, x: torch.Tensor, num_kv_heads: int, head_dim: int) -> None:
+        """
+        Shape of x: [batch_size, seq_len, head_dim]
+
+        Returns:
+            A KVStore with zero-initialized 'k' and 'v', each with shape [batch_size, seq_len, num_kv_heads, head_dim].
+        """
+        b, t, _ = x.shape
+        self.k = torch.zeros((b, t, num_kv_heads, head_dim), dtype=x.dtype, device=x.device)
+        self.v = torch.zeros((b, t, num_kv_heads, head_dim), dtype=x.dtype, device=x.device)
+
+
+@dataclass
+class BlockKVStore:
+    """
+    Stores 2 KVStore objects:
+        - kv_local: for local/sliding window attention
+        - kv_global: for global attention
+    """
+
+    def __init__(
+        self,
+        kv_local: Optional[KVStore] = None,
+        kv_global: Optional[KVStore] = None,
+        x: Optional[torch.Tensor] = None,
+        num_kv_heads: Optional[int] = None,
+        num_global_kv_heads: Optional[int] = None,
+        head_dim: Optional[int] = None,
+    ):
+        if kv_local is None:
+            self.kv_local = KVStore(x, num_kv_heads, head_dim)
+        else:
+            self.kv_local = kv_local
+
+        if kv_global is None:
+            self.kv_global = KVStore(x, num_global_kv_heads, head_dim)
+        else:
+            self.kv_global = kv_global
+
+    def update_kv_store(self, is_sliding: bool, new_kv_store: KVStore) -> "BlockKVStore":
+        """Return a new BlockKVStore with either the local or global KVStore replaced."""
+        if is_sliding:
+            return BlockKVStore(kv_local=new_kv_store, kv_global=self.kv_global)
+        else:
+            return BlockKVStore(kv_local=self.kv_local, kv_global=new_kv_store)
+
+    def get_kv_store(self, is_sliding: bool) -> Optional[KVStore]:
+        """Return the relevant KVStore if sharing is enabled. Otherwise return None."""
+        return self.kv_local if is_sliding else self.kv_global

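The classes above are plain containers, so the intended flow is easy to sketch. The snippet below is a minimal usage sketch and is not part of the commit: it assumes a transformers build that includes this change, and the batch size, sequence length, and head counts are illustrative values only.

```python
import torch

# Assumes a transformers build containing this commit; both classes live in cache_utils.
from transformers.cache_utils import BlockKVStore, KVStore

# Illustrative shapes: batch=2, seq_len=16, hidden=32, 2 local KV heads, 1 global KV head, head_dim=8.
x = torch.zeros(2, 16, 32)
store = BlockKVStore(x=x, num_kv_heads=2, num_global_kv_heads=1, head_dim=8)

# A sliding-window layer fills a fresh KVStore with its projected keys and values ...
local_kv = KVStore(x, num_kv_heads=2, head_dim=8)
local_kv.k.normal_()  # stand-in for the real key projection
local_kv.v.normal_()  # stand-in for the real value projection

# ... and publishes it. update_kv_store returns a new container instead of mutating in place,
# so a deeper layer can read the shared cache back through get_kv_store.
store = store.update_kv_store(is_sliding=True, new_kv_store=local_kv)
shared = store.get_kv_store(is_sliding=True)
print(shared.k.shape)  # torch.Size([2, 16, 2, 8])
```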
src/transformers/models/gemma3p5/configuration_gemma3p5.py

Lines changed: 102 additions & 100 deletions
@@ -19,25 +19,26 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional
+import fractions
+from collections.abc import Sequence
+from typing import Any, Optional, Union
 
 from ...configuration_utils import PretrainedConfig
-from ...modeling_rope_utils import rope_config_validation
 from ...utils import logging
-from ..siglip import SiglipVisionConfig
+from ..gemma3 import Gemma3TextConfig
 
 
 logger = logging.get_logger(__name__)
 
 
-class Gemma3p5TextConfig(PretrainedConfig):
+class Gemma3p5TextConfig(Gemma3TextConfig):
     r"""
     This is the configuration class to store the configuration of a [`Gemma3p5TextModel`]. It is used to instantiate an Gemma3p5Text
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the Gemma3p5Text-7B.
-    e.g. [google/gemma3p5_text-7b](https://huggingface.co/google/gemma3p5_text-7b)
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
+    defaults will yield a similar configuration to that of the Gemma3p5Text-4B.
+    e.g. [google/gemma3p5_text-4b](https://huggingface.co/google/gemma3p5_text-4b)  # TODO (sindhuraghuram): Update the link here
+    Configuration objects inherit from [`Gemma3TextConfig`] and can be used to control the model outputs. Read the
+    documentation from [`Gemma3TextConfig`] for more information.
     Args:
         vocab_size (`int`, *optional*, defaults to 262208):
             Vocabulary size of the Gemma3p5Text model. Defines the number of different tokens that can be represented by the
@@ -134,105 +135,100 @@ class Gemma3p5TextConfig(PretrainedConfig):
             Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
         rope_local_base_freq (float, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings for local attention.
-        sliding_window_pattern (`int`, *optional*, defaults to 6):
+        sliding_window_pattern (`int`, *optional*, defaults to 5):
             Pattern for the sliding window attention.
 
+    TODO (sindhuraghuram): Update the list of configs
+
     ```python
     >>> from transformers import Gemma3p5TextModel, Gemma3p5TextConfig
-    >>> # Initializing a Gemma3p5Text gemma3p5_text-7b style configuration
+    >>> # Initializing a Gemma3p5Text gemma3p5_text-4b style configuration
     >>> configuration = Gemma3p5TextConfig()
-    >>> # Initializing a model from the gemma3p5_text-7b style configuration
+    >>> # Initializing a model from the gemma3p5_text-4b style configuration
     >>> model = Gemma3p5TextModel(configuration)
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```
         rope_local_base_freq (float, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings for local attention.
-        sliding_window_pattern (`int`, *optional*, defaults to 6):
+        sliding_window_pattern (`int`, *optional*, defaults to 5):
             Pattern for the sliding window attention.
     """
 
     model_type = "gemma3p5_text"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise",
-        "layers.*.self_attn.k_proj": "colwise",
-        "layers.*.self_attn.v_proj": "colwise",
-        "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.gate_proj": "colwise",
-        "layers.*.mlp.up_proj": "colwise",
-        "layers.*.mlp.down_proj": "rowwise",
-    }
-    base_model_pp_plan = {
-        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-        "norm": (["hidden_states"], ["hidden_states"]),
-    }
 
     def __init__(
         self,
-        vocab_size=262_208,
-        hidden_size=2304,
-        intermediate_size=9216,
-        num_hidden_layers=26,
-        num_attention_heads=8,
-        num_key_value_heads=4,
-        head_dim=256,
-        hidden_activation="gelu_pytorch_tanh",
-        max_position_embeddings=131_072,
-        initializer_range=0.02,
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        pad_token_id=0,
-        eos_token_id=1,
-        bos_token_id=2,
-        tie_word_embeddings=True,
-        rope_theta=1_000_000.0,
-        attention_bias=False,
-        attention_dropout=0.0,
-        query_pre_attn_scalar=256,
-        sliding_window=4096,
-        final_logit_softcapping=None,
-        attn_logit_softcapping=None,
-        cache_implementation="hybrid",
-        rope_scaling=None,
-        rope_local_base_freq=10_000.0,
-        sliding_window_pattern=6,
-        **kwargs,
+        vocab_size: int = 262_144,
+        hidden_size: int = 2048,
+        hidden_size_per_layer_input: int = 256,
+        num_hidden_layers: int = 35,
+        sliding_window: int = 512,
+        intermediate_size: int = 16_384,
+        num_key_value_heads: int = 2,
+        rope_theta: float = 1_000_000.0,
+        rope_local_base_freq: float = 10_000.0,
+        sliding_window_pattern: int = 5,
+        final_logit_softcapping: float = 30.0,
+        altup_active_idx: int = 0,
+        altup_coef_clip: float = 120.0,
+        altup_lr_multiplier: float = 1.0,
+        altup_num_inputs: int = 8,
+        altup_num_modalities: int = 4,
+        frac_shared_layers: Union[float, fractions.Fraction] = 0.5,
+        laurel_rank: int = 64,
+        activation_sparsity_pattern: Optional[Sequence[float]] = None,
+        **super_kwargs,
     ):
+        super_kwargs["rope_scaling"] = None
+
         super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+            num_hidden_layers=num_hidden_layers,
+            num_key_value_heads=num_key_value_heads,
+            intermediate_size=intermediate_size,
+            rope_theta=rope_theta,
+            rope_local_base_freq=rope_local_base_freq,
+            sliding_window=sliding_window,
+            sliding_window_pattern=sliding_window_pattern,
+            final_logit_softcapping=final_logit_softcapping,
+            **super_kwargs,
         )
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.head_dim = head_dim
-        self.num_key_value_heads = num_key_value_heads
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-        self.hidden_activation = hidden_activation
-        self.query_pre_attn_scalar = query_pre_attn_scalar
-        self.sliding_window = sliding_window
-        self.final_logit_softcapping = final_logit_softcapping
-        self.attn_logit_softcapping = attn_logit_softcapping
-        self.cache_implementation = cache_implementation
-
-        self.rope_local_base_freq = rope_local_base_freq
-        # For configuring HybridCache to work with 5:1 attention pattern
-        self.sliding_window_pattern = sliding_window_pattern
-        self.rope_scaling = rope_scaling
-        rope_config_validation(self)
+        self.hidden_size_per_layer_input = hidden_size_per_layer_input
+
+        self.altup_active_idx = altup_active_idx
+        self.altup_coef_clip = altup_coef_clip
+        self.altup_lr_multiplier = altup_lr_multiplier
+        self.altup_num_inputs = altup_num_inputs
+        self.altup_num_modalities = altup_num_modalities
+
+        self.laurel_rank = laurel_rank
+
+        self.frac_shared_layers = frac_shared_layers
+        if (
+            activation_sparsity_pattern is not None
+            and (len_asp := len(activation_sparsity_pattern)) != num_hidden_layers
+        ):
+            raise ValueError(
+                "activation_sparsity_pattern must have an explicit activation sparsity value for every layer. "
+                f"Expected {num_hidden_layers} values but got {len_asp}."
+            )
+        self.activation_sparsity_pattern = activation_sparsity_pattern
+
+
+class Gemma3p5AudioConfig(PretrainedConfig):
+    model_type = "gemma3p5"
+
+    def __init__(self):
+        pass
+
+
+class Gemma3p5VisionConfig(PretrainedConfig):
+    model_type = "gemma3p5"
+
+    def __init__(self):
+        pass
 
 
 class Gemma3p5Config(PretrainedConfig):
@@ -287,37 +283,43 @@ class Gemma3p5Config(PretrainedConfig):
     model_type = "gemma3p5"
     sub_configs = {
         "text_config": Gemma3p5TextConfig,
-        "vision_config": SiglipVisionConfig,
+        "vision_config": Gemma3p5VisionConfig,
+        "audio_config": Gemma3p5AudioConfig,
     }
 
     def __init__(
         self,
-        text_config: Optional[Gemma3p5TextConfig] = None,
-        vision_config: Optional[SiglipVisionConfig] = None,
+        text_config: Optional[Union[Gemma3p5TextConfig, dict[str, Any]]] = None,
+        vision_config: Optional[Union[Gemma3p5VisionConfig, dict[str, Any]]] = None,
+        audio_config: Optional[Union[Gemma3p5AudioConfig, dict[str, Any]]] = None,
         mm_tokens_per_image: int = 256,
         boi_token_index: int = 255_999,
         eoi_token_index: int = 256_000,
         image_token_index: int = 262_144,
         initializer_range: float = 0.02,
         **kwargs,
     ):
-        if text_config is None:
-            text_config = Gemma3p5TextConfig()
-            logger.info("text_config is None, using default Gemma3p5TextConfig vision config.")
-        elif isinstance(text_config, dict):
+        if isinstance(text_config, dict):
             text_config = Gemma3p5TextConfig(**text_config)
+        elif text_config is None:
+            text_config = Gemma3p5TextConfig()
+            logger.info("text_config is None. Using default Gemma3p5TextConfig.")
 
         if isinstance(vision_config, dict):
-            vision_config = SiglipVisionConfig(**vision_config)
-        else:
-            vision_config = SiglipVisionConfig()
-            logger.info(
-                "vision_config is None or incompatible with Gemma3p5VisionConfig intialization. Gemma3p5 will be limited "
-                "to text tasks."
-            )
+            vision_config = Gemma3p5VisionConfig(**vision_config)
+        elif vision_config is None:
+            vision_config = Gemma3p5VisionConfig()
+            logger.info("vision_config is None. Using default Gemma3p5VisionConfig.")
+
+        if isinstance(audio_config, dict):
+            audio_config = Gemma3p5AudioConfig(**audio_config)
+        elif audio_config is None:
+            audio_config = Gemma3p5AudioConfig()
+            logger.info("audio_config is None. Using default Gemma3p5AudioConfig.")
 
         self.text_config = text_config
         self.vision_config = vision_config
+        self.audio_config = audio_config
         self.mm_tokens_per_image = mm_tokens_per_image
         self.boi_token_index = boi_token_index
         self.eoi_token_index = eoi_token_index

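As a quick sanity check of the new configuration surface, the sketch below is not part of the commit: it assumes a transformers build that includes this change (the top-level import mirrors the docstring example above), and the sparsity values are illustrative only.

```python
from transformers import Gemma3p5TextConfig

# Defaults introduced here: 35 hidden layers, hidden_size 2048, sliding_window 512, sliding_window_pattern 5.
config = Gemma3p5TextConfig()
print(config.num_hidden_layers, config.altup_num_inputs, config.frac_shared_layers)  # 35 8 0.5

# activation_sparsity_pattern must provide exactly one value per hidden layer ...
config = Gemma3p5TextConfig(activation_sparsity_pattern=[0.95] * 10 + [0.0] * 25)

# ... otherwise __init__ raises a ValueError.
try:
    Gemma3p5TextConfig(num_hidden_layers=4, activation_sparsity_pattern=[0.95])
except ValueError as err:
    print(err)  # Expected 4 values but got 1.
```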