Commit 6b6314d

remove hack for quantized.get_seq_length and refactor out get_max_shape to masking_utils.py

1 parent: 26c28af

File tree: 2 files changed, +56 -82 lines

  src/transformers/cache_utils.py
  src/transformers/masking_utils.py

src/transformers/cache_utils.py (1 addition, 80 deletions)

@@ -396,32 +396,6 @@ def get_seq_length(self, layer_idx: int = 0) -> int:
             return 0
         return self.layers[layer_idx].get_seq_length()
 
-    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
-        """
-        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
-        the given layer at `layer_idx`.
-        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
-        for each layer.
-        """
-        if isinstance(self.layers[layer_idx], SlidingWindowLayer):
-            query_length = cache_position.shape[0]
-            first_cache_position = cache_position[0]
-
-            local_mask_kv_offset = torch.clamp(first_cache_position - self.config.sliding_window + 1, min=0)
-            # This is not general (see HybridChunkedCache for the whole general case), but it's what the cache returns
-            local_mask_kv_length = max(query_length, self.config.sliding_window)
-            return local_mask_kv_length, local_mask_kv_offset
-
-        full_mask_kv_offset = 0
-        if isinstance(self.layers[layer_idx], StaticLayer):
-            full_mask_kv_length = self.get_max_cache_shape()
-            return full_mask_kv_length, full_mask_kv_offset
-        else:
-            query_length = cache_position.shape[0]
-            past_seen_tokens = self.get_seq_length()
-            kv_length = query_length + past_seen_tokens
-            return kv_length, full_mask_kv_offset
-
     def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]:
         """Converts the `Cache` instance into the its equivalent in the legacy cache format. Used for
         backward compatibility."""
@@ -1376,15 +1350,6 @@ def get_max_cache_shape(self) -> int:
         """Returns the maximum sequence length (i.e. max capacity) of the cache object"""
         return self.self_attention_cache.get_max_cache_shape()
 
-    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
-        """
-        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
-        the given layer at `layer_idx`.
-        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
-        for each layer.
-        """
-        return self.self_attention_cache.get_mask_sizes(cache_position, layer_idx)
-
 
 class HybridCache(Cache):
     """
@@ -1647,37 +1612,6 @@ def reorder_cache(self, beam_idx: torch.LongTensor):
             device = self.value_cache[layer_idx].device
             self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
 
-    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
-        """
-        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
-        the given layer at `layer_idx`.
-        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
-        for each layer.
-        """
-        if self.is_sliding[layer_idx]:
-            query_length = cache_position.shape[0]
-            first_cache_position = cache_position[0]
-
-            local_mask_kv_offset = torch.clamp(first_cache_position - self.sliding_window + 1, min=0)
-            # This is the true general case for any Cache using local attention (sliding or chunked)
-            if first_cache_position >= self.sliding_window:
-                # Here the Cache is already full
-                local_mask_kv_length = self.sliding_window + query_length - 1
-            elif (
-                first_cache_position < self.sliding_window
-                and first_cache_position + query_length > self.sliding_window
-            ):
-                # Here the Cache becomes full with the new input
-                local_mask_kv_length = first_cache_position + query_length
-            else:
-                # Here the Cache is still smaller than the local size, but we return the local size as it's static
-                local_mask_kv_length = self.sliding_window
-            return local_mask_kv_length, local_mask_kv_offset
-
-        full_mask_kv_offset = 0
-        full_mask_kv_length = self.get_max_cache_shape()
-        return full_mask_kv_length, full_mask_kv_offset
-
 
 class OffloadedHybridCache(HybridChunkedCache):
     def __init__(
@@ -1973,14 +1907,13 @@ def __init__(self, cache_config: QuantizedCacheConfig):
         self.config = cache_config
         self._quantized_key_cache: list[torch.Tensor] = []
         self._quantized_value_cache: list[torch.Tensor] = []
-        self._seen_tokens = 0
 
     def init(self, cache: "Cache", **kwargs) -> None:
         """Initialize the quantized processor and validate configuration."""
         self.config.validate()
 
         # Only compatible with DynamicCache
-        if not isinstance(cache, DynamicCache):
+        if not isinstance(cache.layers[0], DynamicLayer):
             raise ValueError("QuantizedCacheProcessor is only compatible with DynamicCache")
 
     def post_update(
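
The compatibility check in the hunk above now inspects the type of the first cache layer instead of the concrete Cache subclass. A minimal standalone sketch (toy stand-in classes, not the real transformers API) shows why that pattern is more permissive: any cache built from dynamic layers is accepted, however the cache object itself is wrapped.

    # Toy stand-ins only; the real DynamicLayer and cache classes live in transformers.cache_utils.
    class DynamicLayer: ...
    class StaticLayer: ...

    class ToyCache:
        def __init__(self, layers):
            self.layers = layers

    def validate_for_quantization(cache: ToyCache) -> None:
        # mirrors the new isinstance(cache.layers[0], DynamicLayer) check in the diff
        if not isinstance(cache.layers[0], DynamicLayer):
            raise ValueError("QuantizedCacheProcessor is only compatible with DynamicCache")

    validate_for_quantization(ToyCache([DynamicLayer()]))   # passes
    # validate_for_quantization(ToyCache([StaticLayer()]))  # would raise ValueError

The next two hunks drop the _seen_tokens bookkeeping that the old class-level check relied on:
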
@@ -1992,9 +1925,6 @@ def post_update(
         cache_kwargs: Optional[dict[str, Any]] = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """Apply quantization after cache update."""
-        # Update the number of seen tokens
-        if layer_idx == 0:
-            self._seen_tokens += key_tensors.shape[-2]
 
         if len(cache.key_cache) < layer_idx:
             raise ValueError("QuantizedCache does not support model usage where layers are skipped. Use DynamicCache.")
@@ -2194,15 +2124,6 @@ def __init__(self, cache_config: QuantizedCacheConfig) -> None:
 
         super().__init__(processor=processor)
 
-    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
-        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
-        if len(self.key_cache) <= layer_idx:
-            return 0
-        # since we cannot get the seq_length of each layer directly and rely on `_seen_tokens` which is
-        # updated every "layer_idx" == 0, this is a hack to get the actual seq_length for the given layer_idx
-        # this part of code otherwise fails when used to verify attn_weight shape in some models
-        return self.processor._seen_tokens if layer_idx == 0 else self.processor._seen_tokens - 1
-
 
 class QuantoQuantizedCache(QuantizedCache):
     """

src/transformers/masking_utils.py (55 additions, 2 deletions)

@@ -18,7 +18,7 @@
 import torch
 import torch.nn.functional as F
 
-from .cache_utils import Cache
+from .cache_utils import Cache, EncoderDecoderCache, HybridChunkedCache, SlidingWindowLayer, StaticLayer
 from .configuration_utils import PretrainedConfig
 from .utils.generic import GeneralInterface
 from .utils.import_utils import is_torch_flex_attn_available, is_torch_greater_or_equal, is_torchdynamo_compiling
@@ -592,6 +592,59 @@ class AttentionMaskInterface(GeneralInterface):
 ALL_MASK_ATTENTION_FUNCTIONS: AttentionMaskInterface = AttentionMaskInterface()
 
 
+def get_mask_sizes(cache, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
+    """
+    Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
+    the given layer at `layer_idx`.
+    The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
+    for each layer.
+    """
+    if isinstance(cache, HybridChunkedCache):  # not yet ported to layer-wise
+        if cache.is_sliding[layer_idx]:
+            query_length = cache_position.shape[0]
+            first_cache_position = cache_position[0]
+
+            local_mask_kv_offset = torch.clamp(first_cache_position - cache.sliding_window + 1, min=0)
+            # This is the true general case for any Cache using local attention (sliding or chunked)
+            if first_cache_position >= cache.sliding_window:
+                # Here the Cache is already full
+                local_mask_kv_length = cache.sliding_window + query_length - 1
+            elif (
+                first_cache_position < cache.sliding_window
+                and first_cache_position + query_length > cache.sliding_window
+            ):
+                # Here the Cache becomes full with the new input
+                local_mask_kv_length = first_cache_position + query_length
+            else:
+                # Here the Cache is still smaller than the local size, but we return the local size as it's static
+                local_mask_kv_length = cache.sliding_window
+            return local_mask_kv_length, local_mask_kv_offset
+
+        return cache.get_max_cache_shape(), 0
+
+    if isinstance(cache, EncoderDecoderCache):
+        cache = cache.attention_cache
+
+    if isinstance(cache.layers[layer_idx], SlidingWindowLayer):
+        query_length = cache_position.shape[0]
+        first_cache_position = cache_position[0]
+
+        local_mask_kv_offset = torch.clamp(first_cache_position - cache.config.sliding_window + 1, min=0)
+        # This is not general (see HybridChunkedCache for the whole general case), but it's what the cache returns
+        local_mask_kv_length = max(query_length, cache.config.sliding_window)
+        return local_mask_kv_length, local_mask_kv_offset
+
+    full_mask_kv_offset = 0
+    if isinstance(cache.layers[layer_idx], StaticLayer):
+        full_mask_kv_length = cache.get_max_cache_shape()
+        return full_mask_kv_length, full_mask_kv_offset
+    else:
+        query_length = cache_position.shape[0]
+        past_seen_tokens = cache_position.shape[0] if cache_position.shape[0] > 1 else cache_position[0] + 1
+        kv_length = query_length + past_seen_tokens
+        return kv_length, full_mask_kv_offset
+
+
 def _preprocess_mask_arguments(
     config: PretrainedConfig,
     input_embeds: torch.Tensor,
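
The sliding-window branch of the new helper reproduces the arithmetic that previously lived on HybridChunkedCache.get_mask_sizes. A standalone sketch with plain integers (illustrative numbers only, not the transformers API) walks through the three regimes the comments describe:

    # Standalone sketch of the sliding-window mask-size arithmetic from get_mask_sizes,
    # written with plain ints instead of tensors.
    def sliding_mask_sizes(first_cache_position: int, query_length: int, sliding_window: int) -> tuple[int, int]:
        kv_offset = max(first_cache_position - sliding_window + 1, 0)
        if first_cache_position >= sliding_window:
            # cache already full
            kv_length = sliding_window + query_length - 1
        elif first_cache_position + query_length > sliding_window:
            # cache becomes full with this input
            kv_length = first_cache_position + query_length
        else:
            # cache still smaller than the window; the local size is static
            kv_length = sliding_window
        return kv_length, kv_offset

    print(sliding_mask_sizes(first_cache_position=0, query_length=5, sliding_window=8))    # (8, 0)  prefill, window not reached
    print(sliding_mask_sizes(first_cache_position=6, query_length=4, sliding_window=8))    # (10, 0) window filled by this input
    print(sliding_mask_sizes(first_cache_position=20, query_length=1, sliding_window=8))   # (8, 13) decoding with a full window

The final hunk switches the call site in _preprocess_mask_arguments over to the new free function:
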
@@ -649,7 +702,7 @@ def _preprocess_mask_arguments(
 
     # If using a cache, it can give all informations about mask sizes based on seen tokens
     if past_key_values is not None:
-        kv_length, kv_offset = past_key_values.get_mask_sizes(cache_position, layer_idx)
+        kv_length, kv_offset = get_mask_sizes(cache_to_query, cache_position, layer_idx)
     # Otherwise, the sizes are simply the input sizes
     else:
         kv_length, kv_offset = input_embeds.shape[1], 0
