huggingface · BenjaminBossan · Aug 21, 2025 · Aug 12, 2025 · Aug 18, 2025 · Aug 18, 2025
diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py
@@ -34,7 +34,7 @@
 from safetensors import safe_open
 from safetensors.torch import save_file as safe_save_file
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from transformers import Cache, DynamicCache, EncoderDecoderCache, HybridCache, PreTrainedModel
+from transformers import Cache, DynamicCache, EncoderDecoderCache, PreTrainedModel
 from transformers.modeling_outputs import QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput
 from transformers.utils import PushToHubMixin
 
@@ -749,22 +749,34 @@ def get_prompt(
                 post_process_fn = TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING[self.config.model_type]
                 past_key_values = post_process_fn(past_key_values)
             elif ("gemma2" in model_type) or ("gemma3_text" in model_type):
+                # TODO: drop this version check and the HybridCache branch once transformers < 4.56 is unsupported
+                transformers_lt_4_56 = packaging.version.parse(transformers.__version__) < packaging.version.parse(
+                    "4.56.0.dev0"
+                )
                 # Gemma2 and Gemma3 only support HybridCache (which does not have the from_legacy_cache method)
-                if max_cache_len is None:
+                if transformers_lt_4_56 and ((max_cache_len is None) or (max_cache_len == -1)):
                     raise ValueError(
-                        "max_cache_len is None but it should have been passed. Something went wrong, please open an "
+                        "max_cache_len is missing but it should have been passed. Something went wrong, please open an "
                         "issue on GitHub with a reproducer: https://github.com/huggingface/peft/issues"
                     )
                 base_config = base_model.config
                 if hasattr(base_config, "get_text_config"):
                     base_config = base_config.get_text_config()
-                new_cache = HybridCache(
-                    base_config,
-                    max_batch_size=batch_size,
-                    max_cache_len=max_cache_len,
-                    dtype=past_key_values[0].dtype,
-                    device=past_key_values[0].device,
-                )
+                if transformers_lt_4_56:
+                    # HybridCache is deprecated, and will be removed in 4.60.0
+                    # see https://github.com/huggingface/transformers/pull/40276
+                    from transformers import HybridCache
+
+                    new_cache = HybridCache(
+                        config=base_config,
+                        max_batch_size=batch_size,
+                        max_cache_len=max_cache_len,
+                        dtype=past_key_values[0].dtype,
+                        device=past_key_values[0].device,
+                    )
+                else:
+                    # transformers 4.56+ uses DynamicCache for gemma; it has no max length, so max_cache_len is unused
+                    new_cache = DynamicCache(config=base_config)
                 cache_position = torch.arange(peft_config.num_virtual_tokens, device=past_key_values[0].device)
                 for layer_idx in range(peft_config.num_layers):
                     key_states, value_states = past_key_values[0][layer_idx], past_key_values[1][layer_idx]
@@ -2068,15 +2080,18 @@ def prepare_inputs_for_generation(self, *args, task_ids: Optional[torch.Tensor]
                 )
                 kwargs["token_type_ids"] = None
 
+            cache: transformers.Cache | None = model_kwargs.get("past_key_values", None)
             # no past_key_values or past_key_values empty cache
-            requires_prompt_injection = (model_kwargs.get("past_key_values", None) is None) or (
-                isinstance(model_kwargs["past_key_values"], transformers.Cache)
-                and not model_kwargs["past_key_values"].get_seq_length()
+            requires_prompt_injection = (cache is None) or (
+                isinstance(cache, transformers.Cache) and not cache.get_seq_length()
             )
 
             if requires_prompt_injection and peft_config.peft_type == PeftType.PREFIX_TUNING:
-                # some archs require max_cache_len to re-initialize the cache
-                max_cache_len = getattr(model_kwargs.get("past_key_values", None), "max_cache_len", None)
+                # some archs require max_cache_len to re-initialize the cache, but DynamicCache has no max len
+                if isinstance(cache, transformers.Cache) and not isinstance(cache, transformers.DynamicCache):
+                    max_cache_len = cache.max_cache_len
+                else:
+                    max_cache_len = -1  # -1 means no max length
                 new_past_key_values = self.get_prompt(
                     batch_size=model_kwargs["input_ids"].shape[0],
                     max_cache_len=max_cache_len,