Commit c030aa2

fix

1 parent 7a01fcd commit c030aa2

2 files changed: +21 -12 lines changed

src/transformers/cache_utils.py (12 additions, 9 deletions)

@@ -177,16 +177,19 @@ class Cache(CacheBase):
     Parameters:
         model_config (`PretrainedConfig`):
             Model configuration for shape/device info.
-        processor (`CacheProcessor`, *optional*):
+        cache_processor (`CacheProcessor`, *optional*):
             Cache processor to apply (e.g., quantization, offloading).
-    Additional arguments for cache configuration:
-    - `max_batch_size`/`batch_size` (`int`): Maximum batch size for static caches
-    - `max_cache_len` (`int`): Maximum sequence length. For hybrid caches:
-        * SlidingWindowLayers: clamped to `min(sliding_window, max_cache_len)`
-        * StaticLayers: uses full `max_cache_len`
-    - `device` (`torch.device`): Device for cache tensors
-    - `dtype` (`torch.dtype`): Data type for cache tensors
-    - `layer_device_map` (`dict[int, Union[str, torch.device]]`): Per-layer device mapping
+        layer_classes (`list[type[CacheLayer]]`, *optional*):
+            List of layer classes to use for the cache.
+
+    Additional arguments for cache configuration:
+    - `max_batch_size`/`batch_size` (`int`): Maximum batch size for static caches
+    - `max_cache_len` (`int`): Maximum sequence length. For hybrid caches:
+        * SlidingWindowLayers: clamped to `min(sliding_window, max_cache_len)`
+        * StaticLayers: uses full `max_cache_len`
+    - `device` (`torch.device`): Device for cache tensors
+    - `dtype` (`torch.dtype`): Data type for cache tensors
+    - `layer_device_map` (`dict[int, Union[str, torch.device]]`): Per-layer device mapping

     Note for hybrid caches (blocks of (StaticLayer, ..., SlidingWindowLayer) repeated across layers):
     - Requires `model_config.sliding_window` to be set
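The `max_cache_len` clamping rule described in the updated docstring can be checked in isolation. A minimal, self-contained sketch follows; the function name `effective_cache_len` and the example layer layout are illustrative, not part of the transformers API:

# Per-layer cache length rule from the docstring above: sliding layers are
# clamped to the window size, static layers keep the full `max_cache_len`.
def effective_cache_len(is_sliding_layer: bool, max_cache_len: int, sliding_window: int) -> int:
    if is_sliding_layer:
        return min(sliding_window, max_cache_len)
    return max_cache_len

# Example: a hybrid layout of alternating (static, sliding) layers with a
# 1024-token window and a 4096-token cache.
layout = [False, True, False, True]
print([effective_cache_len(s, max_cache_len=4096, sliding_window=1024) for s in layout])
# -> [4096, 1024, 4096, 1024]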

src/transformers/masking_utils.py (9 additions, 3 deletions)

@@ -692,7 +692,9 @@ def create_causal_mask(
         useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
     """
     # If we have an HybridCache structure, here we want to create the mask for the full layers
-    is_sliding = [getattr(layer, "is_sliding", False) for layer in past_key_values.layers]
+    is_sliding = []
+    if past_key_values is not None:
+        is_sliding = [getattr(layer, "is_sliding", False) for layer in past_key_values.layers]
     layer_idx = is_sliding.index(True) if True in is_sliding else 0

     early_exit, attention_mask, kv_length, kv_offset = _preprocess_mask_arguments(
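The same `past_key_values is not None` guard is applied in all three mask builders (causal, sliding-window, and chunked; see the two hunks below). A minimal self-contained sketch of the pattern, using hypothetical `FakeLayer`/`FakeCache` stand-ins rather than the real cache classes:

# Sketch of the guard above: when no cache is passed, `is_sliding` stays
# empty and the layer index falls back to 0. `FakeLayer` and `FakeCache`
# are illustrative stand-ins, not transformers classes.
from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeLayer:
    is_sliding: bool = False

@dataclass
class FakeCache:
    layers: list

def pick_layer_idx(past_key_values: Optional[FakeCache]) -> int:
    is_sliding = []
    # Without the guard, `past_key_values.layers` raised AttributeError
    # whenever the mask builders were called with past_key_values=None.
    if past_key_values is not None:
        is_sliding = [getattr(layer, "is_sliding", False) for layer in past_key_values.layers]
    return is_sliding.index(True) if True in is_sliding else 0

print(pick_layer_idx(None))                                      # 0 (no crash)
print(pick_layer_idx(FakeCache([FakeLayer(), FakeLayer(True)]))) # 1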
@@ -772,7 +774,9 @@ def create_sliding_window_causal_mask(
         useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling.
     """
     # If we have an HybridCache structure, here we want to create the mask for the sliding layers
-    is_sliding = [getattr(layer, "is_sliding", False) for layer in past_key_values.layers]
+    is_sliding = []
+    if past_key_values is not None:
+        is_sliding = [getattr(layer, "is_sliding", False) for layer in past_key_values.layers]
     layer_idx = is_sliding.index(True) if True in is_sliding else 0

     early_exit, attention_mask, kv_length, kv_offset = _preprocess_mask_arguments(
@@ -857,7 +861,9 @@ def create_chunked_causal_mask(
         useful to easily overlay another mask on top of the chunked causal one, for example for image tokens handling.
     """
     # If we have an HybridCache structure, here we want to create the mask for the sliding layers
-    is_sliding = [getattr(layer, "is_sliding", False) for layer in past_key_values.layers]
+    is_sliding = []
+    if past_key_values is not None:
+        is_sliding = [getattr(layer, "is_sliding", False) for layer in past_key_values.layers]
     layer_idx = is_sliding.index(True) if True in is_sliding else 0

     early_exit, attention_mask, kv_length, kv_offset = _preprocess_mask_arguments(
