src/transformers/integrations/flex_attention.py (11 additions, 2 deletions)

@@ -31,6 +31,7 @@
import torch

from ..utils import is_torch_flex_attn_available
from ..utils.import_utils import _torch_version


if is_torch_flex_attn_available():
@@ -60,8 +61,16 @@ def __init__(self):
"""
Initialize or update the singleton instance.
"""
        if self._is_flex_compiled is False:
            self._compiled_flex_attention = torch.compile(flex_attention, backend="inductor")
        if not self._is_flex_compiled:
            # In PyTorch 2.6.0, there is a known issue with flex attention compilation that can
            # cause errors during training. The suggested fix is to compile with
            # mode="max-autotune-no-cudagraphs"; see https://github.com/pytorch/pytorch/issues/146260.
            if _torch_version == "2.6.0":
Contributor:
Could you add a comment here to describe why this is the case, e.g. by linking to the issue with a small description?

It might also be nice to use/create something like:

def is_torch_greater_or_equal(library_version: str, accept_dev: bool = False):

Collaborator:
I tested on my end; this should fix training!

                self._compiled_flex_attention = torch.compile(
                    flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"
                )
            else:
                self._compiled_flex_attention = torch.compile(flex_attention, dynamic=False)
            self._is_flex_compiled = True

    def __call__(self):
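A minimal standalone sketch of the version-gated compile this diff adds, outside the singleton wrapper (assuming PyTorch 2.5+ with flex attention available and a CUDA device for the test call; the names compiled_flex, q, k, v are illustrative and not part of the PR):

# Sketch only: mirrors the 2.6.0 workaround above outside transformers.
import torch
from torch.nn.attention.flex_attention import flex_attention

if torch.__version__.startswith("2.6.0"):
    # Avoid cudagraphs when compiling flex attention on 2.6.0,
    # per https://github.com/pytorch/pytorch/issues/146260.
    compiled_flex = torch.compile(flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs")
else:
    compiled_flex = torch.compile(flex_attention, dynamic=False)

if torch.cuda.is_available():
    # (batch, heads, seq_len, head_dim)
    q, k, v = (torch.randn(1, 4, 128, 64, device="cuda", dtype=torch.float16) for _ in range(3))
    out = compiled_flex(q, k, v)
    print(out.shape)  # torch.Size([1, 4, 128, 64])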
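On the Contributor's suggestion above, a sketch of what an is_torch_greater_or_equal-style helper could look like, built on packaging.version. This illustrates the idea only and is not the helper transformers actually ships; the names needs_no_cudagraphs and compile_kwargs are made up for the example:

import torch
from packaging import version


def is_torch_greater_or_equal(library_version: str, accept_dev: bool = False) -> bool:
    # Hypothetical helper in the spirit of the review comment; semantics are assumptions.
    installed = version.parse(torch.__version__)
    if accept_dev:
        # Compare dev/pre-release builds (e.g. "2.7.0.dev20250201") by their base version.
        installed = version.parse(installed.base_version)
    return installed >= version.parse(library_version)


# The 2.6.0-only branch could then be a range check instead of an exact string match:
compile_kwargs = {"dynamic": False}
needs_no_cudagraphs = is_torch_greater_or_equal("2.6.0") and not is_torch_greater_or_equal("2.7.0")
if needs_no_cudagraphs:
    compile_kwargs["mode"] = "max-autotune-no-cudagraphs"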