
Commit e80c68a

remove cache configs, make CacheLayer a mixin (joaos review)
1 parent aec9ccd commit e80c68a

11 files changed: +542 additions, -475 deletions

docs/source/en/kv_cache.md

Lines changed: 3 additions & 4 deletions
@@ -134,15 +134,15 @@ The [`QuantizedCache`] reduces memory requirements by quantizing the KV values t
 > [!WARNING]
 > Quantizing the cache can harm latency if the context length is short and there is enough GPU memory available for generation without enabling cache quantization. Try to find a balance between memory efficiency and latency.

-Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`], and indicate the quantization backend in [`QuantizedCacheConfig`]. Any additional quantization related parameters should also be passed either as a dict or an instance of [`QuantizedCacheConfig`]. You should use the default values for these additional parameters unless you're running out-of-memory. In that case, consider decreasing the residual length.
+Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`]. The quantization backend, as well as any additional quantization-related parameters, should be passed as a dict in `cache_config`. You should use the default values for these additional parameters unless you're running out of memory. In that case, consider decreasing the residual length.

 <hfoptions id="quantized-cache">
 <hfoption id="HQQQuantizedCache">

 For [`HQQQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `1`.

 ```py
-from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache, QuantizedCacheConfig
+from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")

@@ -159,7 +159,7 @@ I like rock music because it's loud and energetic. It's a great way to express m
 For [`QuantoQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `0`.

 ```py
-from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache, QuantizedCacheConfig
+from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")

@@ -273,7 +273,6 @@ from transformers.cache_utils import (
     StaticCache,
     SlidingWindowCache,
     QuantoQuantizedCache,
-    QuantizedCacheConfig,
 )

 model_id = "meta-llama/Llama-2-7b-chat-hf"

docs/source/ko/internal/generation_utils.md

Lines changed: 0 additions & 6 deletions
@@ -345,12 +345,6 @@ generation_output[:2]
 [[autodoc]] Cache
     - update

-[[autodoc]] CacheConfig
-    - update
-
-[[autodoc]] QuantizedCacheConfig
-    - validate
-
 [[autodoc]] DynamicCache
     - update
     - get_seq_length

src/transformers/cache_utils.py

Lines changed: 481 additions & 425 deletions
Large diffs are not rendered by default.
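
Since the `cache_utils.py` diff is not rendered, the following is only a generic, hypothetical illustration of the "CacheLayer as a mixin" idea named in the commit title: shared per-layer behaviour lives in a small mixin that concrete layer types combine with their own storage logic. The class and method names below are made up and do not reflect the actual transformers API.

```py
import torch

# Hypothetical names; not the real transformers classes.
class CacheLayerMixin:
    """Shared per-layer bookkeeping that concrete cache layers mix in."""

    def get_seq_length(self) -> int:
        keys = getattr(self, "keys", None)
        return 0 if keys is None else keys.shape[-2]


class DynamicLayer(CacheLayerMixin):
    """A concrete layer: grows its key/value tensors as new tokens arrive."""

    def __init__(self):
        self.keys = None
        self.values = None

    def update(self, key_states: torch.Tensor, value_states: torch.Tensor):
        if self.keys is None:
            self.keys, self.values = key_states, value_states
        else:
            self.keys = torch.cat([self.keys, key_states], dim=-2)
            self.values = torch.cat([self.values, value_states], dim=-2)
        return self.keys, self.values


layer = DynamicLayer()
layer.update(torch.zeros(1, 2, 3, 4), torch.zeros(1, 2, 3, 4))
print(layer.get_seq_length())  # 3
```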

src/transformers/generation/configuration_utils.py

Lines changed: 2 additions & 26 deletions
@@ -44,7 +44,6 @@

 logger = logging.get_logger(__name__)
 METADATA_FIELDS = ("_from_model_config", "_commit_hash", "_original_object_hash", "transformers_version")
-CACHE_CONFIG_MAPPING = {}
 NEED_SETUP_CACHE_CLASSES_MAPPING = {}
 QUANT_BACKEND_CLASSES_MAPPING = {}
 ALL_CACHE_IMPLEMENTATIONS = []

@@ -56,18 +55,12 @@
     HybridChunkedCache,
     OffloadedHybridCache,
     OffloadedStaticCache,
-    QuantizedCacheConfig,
     QuantoQuantizedCache,
     SlidingWindowCache,
     StaticCache,
-    StaticCacheConfig,
 )
 from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor

-CACHE_CONFIG_MAPPING["quantized"] = QuantizedCacheConfig
-CACHE_CONFIG_MAPPING["static"] = StaticCacheConfig
-CACHE_CONFIG_MAPPING["sliding_window"] = StaticCacheConfig
-CACHE_CONFIG_MAPPING["hybrid"] = StaticCacheConfig
 NEED_SETUP_CACHE_CLASSES_MAPPING = {
     "static": StaticCache,
     "offloaded_static": OffloadedStaticCache,

@@ -188,10 +181,8 @@ class GenerationConfig(PushToHubMixin):

             If none is specified, we will use the default cache for the model (which is often [`DynamicCache`]). See
             our [cache documentation](https://huggingface.co/docs/transformers/en/kv_cache) for further information.
-        cache_config (`CacheConfig` or `dict`, *optional*, default to `None`):
-            Arguments used in the key-value cache class can be passed in `cache_config`. Can be passed as a `Dict` and
-            it will be converted to its respective `CacheConfig` internally.
-            Otherwise can be passed as a `CacheConfig` class matching the indicated `cache_implementation`.
+        cache_config (`dict`, *optional*, default to `None`):
+            Arguments used in the key-value cache class can be passed in `cache_config`.
         return_legacy_cache (`bool`, *optional*, default to `True`):
             Whether to return the legacy or new format of the cache when `DynamicCache` is used by default.

@@ -406,10 +397,6 @@ def __init__(self, **kwargs):
         self.use_cache = kwargs.pop("use_cache", True)
         self.cache_implementation = kwargs.pop("cache_implementation", None)
         self.cache_config = kwargs.pop("cache_config", None)
-        if self.cache_implementation is not None and self.cache_implementation in CACHE_CONFIG_MAPPING:
-            cache_config_class = CACHE_CONFIG_MAPPING[self.cache_implementation]
-            if isinstance(self.cache_config, dict):
-                self.cache_config = cache_config_class.from_dict(self.cache_config)
         self.return_legacy_cache = kwargs.pop("return_legacy_cache", None)
         self.prefill_chunk_size = kwargs.pop("prefill_chunk_size", None)

@@ -611,17 +598,6 @@ def validate(self, strict=False):
                 f"Invalid `cache_implementation` ({self.cache_implementation}). Choose one of: "
                 f"{ALL_CACHE_IMPLEMENTATIONS}"
             )
-        if self.cache_config is not None:
-            cache_class = CACHE_CONFIG_MAPPING.get(self.cache_implementation)
-            if cache_class is None:
-                raise ValueError(
-                    "You provided a `cache_config` but the cache implementation you are using "
-                    f"({self.cache_implementation}) does not require any config. Make sure to use the "
-                    "correct cache implementation matching your cache config."
-                )
-            if not isinstance(self.cache_config, cache_class):
-                self.cache_config = cache_class.from_dict(self.cache_config)
-            self.cache_config.validate()
         # 1.3. Performance attributes
         if self.compile_config is not None and not isinstance(self.compile_config, CompileConfig):
             raise ValueError(
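
A short sketch (not taken from the diff) of what the simplified handling means in practice: `cache_config` now stays a plain dict on the `GenerationConfig` instead of being converted to a `CacheConfig` subclass and validated at init time.

```py
from transformers import GenerationConfig

# The dict is stored as-is; there is no conversion to a CacheConfig subclass anymore.
generation_config = GenerationConfig(
    cache_implementation="quantized",
    cache_config={"backend": "quanto"},  # keys are forwarded to the cache class at generation time
)
print(type(generation_config.cache_config))  # <class 'dict'>
```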

src/transformers/generation/utils.py

Lines changed: 5 additions & 6 deletions
@@ -35,7 +35,6 @@
     HybridChunkedCache,
     OffloadedCache,
     OffloadedHybridCache,
-    QuantizedCacheConfig,
 )
 from ..configuration_utils import PretrainedConfig
 from ..dynamic_module_utils import (

@@ -2077,22 +2076,22 @@ def _prepare_cache_for_generation(
                cache_config = (
                    generation_config.cache_config
                    if generation_config.cache_config is not None
-                   else QuantizedCacheConfig()
+                   else {"backend": "quanto"}
                )
-               cache_class = QUANT_BACKEND_CLASSES_MAPPING[cache_config.backend]
+               cache_class = QUANT_BACKEND_CLASSES_MAPPING[cache_config["backend"]]

-               if cache_config.backend == "quanto" and not is_optimum_quanto_available():
+               if cache_config["backend"] == "quanto" and not is_optimum_quanto_available():
                    raise ImportError(
                        "You need to install optimum-quanto in order to use KV cache quantization with optimum-quanto backend. "
                        "Please install it via with `pip install optimum-quanto`"
                    )
-               elif cache_config.backend == "HQQ" and not is_hqq_available():
+               elif cache_config["backend"] == "HQQ" and not is_hqq_available():
                    raise ImportError(
                        "You need to install `HQQ` in order to use KV cache quantization with HQQ backend. "
                        "Please install it via with `pip install hqq`"
                    )

-               model_kwargs[cache_name] = cache_class(cache_config)
+               model_kwargs[cache_name] = cache_class(**cache_config)
            elif generation_config.cache_implementation == "offloaded":
                model_kwargs[cache_name] = OffloadedCache()
            elif generation_config.cache_implementation == "dynamic":
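
To illustrate the new control flow in `_prepare_cache_for_generation`, here is a self-contained sketch with stand-in classes (the real mapping and cache classes differ): the `backend` key picks the cache class, and the whole dict is unpacked as keyword arguments, replacing the old `QuantizedCacheConfig` object.

```py
# Stand-in class and mapping; only the dispatch pattern mirrors the diff above.
class _QuantoCache:
    def __init__(self, backend="quanto", nbits=4, **kwargs):
        self.backend, self.nbits = backend, nbits


QUANT_BACKEND_CLASSES = {"quanto": _QuantoCache}


def build_quantized_cache(cache_config=None):
    # Default to the quanto backend when no cache_config was provided.
    cache_config = cache_config if cache_config is not None else {"backend": "quanto"}
    cache_class = QUANT_BACKEND_CLASSES[cache_config["backend"]]
    # The dict is unpacked into keyword arguments instead of being wrapped in a config object.
    return cache_class(**cache_config)


cache = build_quantized_cache({"backend": "quanto", "nbits": 2})
print(type(cache).__name__, cache.nbits)  # _QuantoCache 2
```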

src/transformers/integrations/executorch.py

Lines changed: 3 additions & 3 deletions
@@ -276,9 +276,9 @@ def __init__(self, model: PreTrainedModel):
         self.model = model
         self.static_cache = StaticCache(
             model_config=self.model.config,
-            max_batch_size=self.model.generation_config.cache_config.batch_size,
-            max_cache_len=self.model.generation_config.cache_config.max_cache_len,
-            device=self.model.generation_config.cache_config.device,
+            max_batch_size=self.model.generation_config.cache_config.get("batch_size"),
+            max_cache_len=self.model.generation_config.cache_config.get("max_cache_len"),
+            device=self.model.generation_config.cache_config.get("device"),
             dtype=self.model.dtype,
         )
         for i in range(len(self.static_cache)):
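
Since `cache_config` is now a plain dict, the ExecuTorch wrapper reads its entries with `.get()`, which returns `None` for keys that were not provided instead of raising an `AttributeError`. A tiny illustration with hypothetical values:

```py
from transformers import GenerationConfig

generation_config = GenerationConfig(
    cache_implementation="static",
    cache_config={"batch_size": 1, "max_cache_len": 128},  # "device" intentionally omitted
)

# Attribute access (cache_config.device) would fail on a dict; .get() degrades to None instead.
print(generation_config.cache_config.get("device"))  # None
```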

src/transformers/masking_utils.py

Lines changed: 2 additions & 2 deletions
@@ -693,7 +693,7 @@ def create_causal_mask(
     """
     # If we have an HybridCache structure, here we want to create the mask for the full layers
     is_sliding = []
-    if past_key_values is not None:
+    if past_key_values is not None and past_key_values.layers is not None:
         is_sliding = [getattr(layer, "is_sliding", False) for layer in past_key_values.layers]
     layer_idx = is_sliding.index(True) if True in is_sliding else 0

@@ -775,7 +775,7 @@ def create_sliding_window_causal_mask(
     """
     # If we have an HybridCache structure, here we want to create the mask for the sliding layers
     is_sliding = []
-    if past_key_values is not None:
+    if past_key_values is not None and past_key_values.layers is not None:
         is_sliding = [getattr(layer, "is_sliding", False) for layer in past_key_values.layers]
     layer_idx = is_sliding.index(True) if True in is_sliding else 0

src/transformers/models/zamba/modeling_zamba.py

Lines changed: 3 additions & 0 deletions
@@ -146,6 +146,9 @@ def __init__(self, config, batch_size, dtype=torch.float16, device=None):
     def __len__(self):
         return len(self.key_cache)

+    def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
     # Copied from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache.update
     def update(
         self,

src/transformers/models/zamba2/modeling_zamba2.py

Lines changed: 3 additions & 0 deletions
@@ -150,6 +150,9 @@ def __init__(
     def __len__(self):
         return len(self.key_cache)

+    def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
     def update(
         self,
         key_states: torch.Tensor,
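
The `__getitem__` added to both hybrid Mamba/attention caches lets callers unpack a layer's key/value tensors with tuple-style indexing, matching how legacy tuple caches are consumed. A toy stand-in class (not the real cache, which requires a model config to construct) shows the pattern:

```py
import torch


class _ToyHybridCache:
    """Stand-in with the same key_cache/value_cache lists as the real classes."""

    def __init__(self):
        self.key_cache = [torch.zeros(1, 2, 4, 8)]
        self.value_cache = [torch.zeros(1, 2, 4, 8)]

    def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        return self.key_cache[layer_idx], self.value_cache[layer_idx]


key_states, value_states = _ToyHybridCache()[0]  # tuple-style access per layer
print(key_states.shape, value_states.shape)
```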

tests/models/falcon_h1/test_modeling_falcon_h1.py

Lines changed: 38 additions & 1 deletion
@@ -36,7 +36,7 @@
 if is_torch_available():
     import torch

-    from transformers import AutoTokenizer, FalconH1ForCausalLM, FalconH1Model
+    from transformers import AutoTokenizer, Cache, FalconH1ForCausalLM, FalconH1Model
     from transformers.models.falcon_h1.modeling_falcon_h1 import (
         FalconHybridMambaAttentionDynamicCache,
     )

@@ -270,6 +270,43 @@ class FalconH1ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
         {"feature-extraction": FalconH1Model, "text-generation": FalconH1ForCausalLM} if is_torch_available() else {}
     )

+    def _check_past_key_values_for_generate(self, batch_size, decoder_past_key_values, cache_length, config):
+        self.assertIsInstance(decoder_past_key_values, (tuple, Cache))
+
+        # (batch, head, seq_length, head_features)
+        expected_shape = (
+            batch_size,
+            config.num_key_value_heads if hasattr(config, "num_key_value_heads") else config.num_attention_heads,
+            cache_length,
+            config.hidden_size // config.num_attention_heads,
+        )
+
+        if isinstance(decoder_past_key_values, Cache):
+            self.assertListEqual(
+                [key_tensor.shape for key_tensor in decoder_past_key_values.key_cache],
+                [expected_shape] * len(decoder_past_key_values.key_cache),
+            )
+            self.assertListEqual(
+                [value_cache.shape for value_cache in decoder_past_key_values.value_cache],
+                [expected_shape] * len(decoder_past_key_values.value_cache),
+            )
+
+        # Legacy cache format checks. This branch should be removed when all models use `Cache` by default
+        else:
+            self.assertListEqual(
+                [isinstance(iter_past_key_values, tuple) for iter_past_key_values in decoder_past_key_values],
+                [True] * len(decoder_past_key_values),
+            )
+            # check shape key, value
+            self.assertListEqual(
+                [layer_past_key_values[0].shape for layer_past_key_values in decoder_past_key_values],
+                [expected_shape] * len(decoder_past_key_values),
+            )
+            self.assertListEqual(
+                [layer_past_key_values[1].shape for layer_past_key_values in decoder_past_key_values],
+                [expected_shape] * len(decoder_past_key_values),
+            )
+
     def setUp(self):
         self.model_tester = FalconH1ModelTester(self)
         self.config_tester = ConfigTester(self, config_class=FalconH1Config, hidden_size=64)
