Commit 59a3296

Update cache_utils.py
1 parent 9ff0cb6 commit 59a3296

1 file changed

src/transformers/cache_utils.py

Lines changed: 41 additions & 12 deletions
@@ -1236,7 +1236,44 @@ def __init__(self, max_cache_len: int, config: PretrainedConfig):
         self.offloading = True
 
 
-class QuantoQuantizedCache(Cache):
+class QuantizedCache(Cache):
+    """
+    A quantizer cache similar to what is described in the [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache paper](https://arxiv.org/abs/2402.02750).
+    It allows the model to generate longer sequence length without allocating too much memory for Key and Value cache by applying quantization.
+    The cache has two types of storage, one for original precision and one for the quantized cache. A `residual_length` is set as a maximum capacity for the
+    original precision cache. When the length goes beyond maximum capacity, the original precision cache is discarded and moved into the quantized cache. The
+    quantization is done per-channel with a set `q_group_size` for both Keys and Values, in contrast to what was described in the paper.
+    It stores Keys and Values as a list of quantized tensors (tuples in case we need to store metadata), one for each layer. Additionally, it stores the Key and
+    Value in original precision states as a list of tensors, one for each layer. The size of each tensor
+    is `[batch_size, num_heads, seq_len - residual_length, head_dim]`.
+    See `Cache` for details on common methods that are implemented by all cache classes.
+    """
+
+    def __init__(
+        self,
+        backend: str,
+        config: PretrainedConfig,
+        nbits: int = 4,
+        axis_key: int = 0,
+        axis_value: int = 0,
+        q_group_size: int = 64,
+        residual_length: int = 128,
+    ):
+        if backend == "quanto":
+            layer_class = QuantoQuantizedLayer
+        elif backend == "hqq":
+            layer_class = HQQQuantizedLayer
+        else:
+            raise ValueError(f"Unknown quantization backend `{backend}`")
+
+        layers = [
+            layer_class(nbits, axis_key, axis_value, q_group_size, residual_length)
+            for _ in range(config.num_hidden_layers)
+        ]
+        super().__init__(layers=layers)
+
+
+class QuantoQuantizedCache(QuantizedCache):
     """
     A quantizer cache similar to what is described in the [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache paper](https://huggingface.co/papers/2402.02750).
     It allows the model to generate longer sequence length without allocating too much memory for Key and Value cache by applying quantization.
@@ -1276,14 +1313,10 @@ def __init__(
         q_group_size: int = 64,
         residual_length: int = 128,
     ):
-        layers = [
-            QuantoQuantizedLayer(nbits, axis_key, axis_value, q_group_size, residual_length)
-            for _ in range(config.num_hidden_layers)
-        ]
-        super().__init__(layers=layers)
+        super().__init__("quanto", config, nbits, axis_key, axis_value, q_group_size, residual_length)
 
 
-class HQQQuantizedCache(Cache):
+class HQQQuantizedCache(QuantizedCache):
     """
     A quantizer cache similar to what is described in the [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache paper](https://arxiv.org/abs/2402.02750).
     It allows the model to generate longer sequence length without allocating too much memory for Key and Value cache by applying quantization.
@@ -1323,11 +1356,7 @@ def __init__(
         q_group_size: int = 64,
         residual_length: int = 128,
     ):
-        layers = [
-            HQQQuantizedLayer(nbits, axis_key, axis_value, q_group_size, residual_length)
-            for _ in range(config.num_hidden_layers)
-        ]
-        super().__init__(layers=layers)
+        super().__init__("hqq", config, nbits, axis_key, axis_value, q_group_size, residual_length)
 
 
 class EncoderDecoderCache(Cache):
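
This commit factors the backend dispatch out of QuantoQuantizedCache and HQQQuantizedCache into a shared QuantizedCache base class; both subclasses now simply forward their arguments to it with a fixed backend string ("quanto" or "hqq"). As a minimal usage sketch, not part of the commit, assuming QuantizedCache is importable from transformers.cache_utils after this change, that the chosen backend library (optimum-quanto here) is installed, and using a placeholder checkpoint name, the new constructor could be exercised like this:

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.cache_utils import QuantizedCache

model_id = "my-org/my-causal-lm"  # placeholder; any causal LM checkpoint works
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Build the cache from the new base class: `backend` picks the quantized layer
# implementation ("quanto" or "hqq"); the remaining arguments mirror the
# QuantizedCache.__init__ signature shown in the diff above.
cache = QuantizedCache(
    backend="quanto",
    config=model.config,
    nbits=4,
    axis_key=0,
    axis_value=0,
    q_group_size=64,
    residual_length=128,
)

inputs = tokenizer("Quantizing the KV cache saves memory.", return_tensors="pt")
# Passing an explicit Cache object via `past_key_values` is the standard generate() hook.
outputs = model.generate(**inputs, max_new_tokens=32, past_key_values=cache)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

After the refactor, QuantoQuantizedCache(config=model.config) and HQQQuantizedCache(config=model.config) still work and resolve to this same code path through the base class.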
