Commit d8f0695

winglian authored and ArthurZucker committed
more fixes for post-training llama4 (#37329)
* more fixes for post-training llama4
* use target_length instead of guarded past_key_values
1 parent d27c8c3 commit d8f0695

1 file changed: +3 additions, −2 deletions


src/transformers/models/llama4/modeling_llama4.py

Lines changed: 3 additions & 2 deletions
@@ -730,6 +730,7 @@ def forward(
         )
         return output if return_dict else output.to_tuple()
 
+    @torch.compiler.disable # the operations in this method are not compilable
     def _update_causal_mask(
         self,
         attention_mask: torch.Tensor,
@@ -767,7 +768,7 @@ def _update_causal_mask(
         )
 
         if past_key_values is not None and past_key_values.is_compileable:
-            target_length = past_key_values.get_max_cache_shape
+            target_length = past_key_values.get_max_cache_shape()
         else:
             target_length = attention_mask.shape[-1] if attention_mask is not None else sequence_length
 
@@ -780,7 +781,7 @@ def _update_causal_mask(
             attention_mask = make_flex_block_causal_mask(
                 attention_mask,
                 query_length=sequence_length,
-                key_length=past_key_values.get_max_cache_shape(),
+                key_length=target_length,
                 offsets=None if sequence_length != 1 else (first_cache_position, 0),
             )
         return attention_mask, chunked_attention_mask
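For readers landing here from a post-training stack trace, the following is a minimal, self-contained sketch of why the two one-line changes matter. This is not the transformers API: ToyCache and update_causal_mask below are hypothetical stand-ins, and the mask construction is reduced to a plain torch.tril placeholder.

# Minimal sketch, assuming nothing about the real transformers cache classes.
import torch


class ToyCache:
    is_compileable = True

    def get_max_cache_shape(self) -> int:
        return 8  # pretend the static cache was allocated for 8 key/value positions


def update_causal_mask(attention_mask: torch.Tensor, past_key_values=None) -> torch.Tensor:
    sequence_length = attention_mask.shape[-1]

    if past_key_values is not None and past_key_values.is_compileable:
        # Fix 1: without the trailing "()" this line would bind the method object
        # itself rather than an int, and any later shape arithmetic would fail.
        target_length = past_key_values.get_max_cache_shape()
    else:
        target_length = sequence_length

    # Fix 2: deriving the key length from past_key_values.get_max_cache_shape()
    # directly would fail whenever past_key_values is None (a common post-training
    # forward pass); reusing the guarded target_length covers both branches.
    # torch.tril over a (query, key) grid stands in for the real mask construction.
    return torch.tril(torch.ones(sequence_length, target_length, dtype=torch.bool))


print(update_causal_mask(torch.ones(1, 4)).shape)                              # torch.Size([4, 4])
print(update_causal_mask(torch.ones(1, 4), past_key_values=ToyCache()).shape)  # torch.Size([4, 8])

The remaining change, the @torch.compiler.disable decorator on _update_causal_mask, keeps this data-dependent mask logic out of the torch.compile graph, matching the inline comment that the operations in the method are not compilable.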
