[Mistral & Mixtral] Add sliding window param to sdpa after torch 2.2.0 #29220

Closed · wants to merge 55 commits

Changes from 11 commits

Commits (55)
97cff89
add sliding window param to sdpa after torch==2.2.0
ehuaa Feb 22, 2024
d0baf19
add sliding window param to sdpa after torch==2.2.0
ehuaa Feb 22, 2024
c9dacb8
revert add sliding window for qwen2 because of numerical error
ehuaa Feb 22, 2024
9c7bb07
Merge branch 'add_sliding_window_for_sdpa' of https://mirror.ghproxy.…
ehuaa Feb 22, 2024
f464d15
remove adding sliding_window param to qwen2 because of numerical error
ehuaa Feb 22, 2024
f4c21d0
fix style
ehuaa Feb 22, 2024
a9f1571
only add non-contigous mask to qwen2 due to numerical error
ehuaa Feb 22, 2024
8b84d68
revert non-contigous tensor modification
ehuaa Feb 24, 2024
6bee407
move torch version judgement to import_utils for usability
ehuaa Feb 27, 2024
cf225ff
Merge branch 'main' into add_sliding_window_for_sdpa
ehuaa Feb 27, 2024
9c4f0b0
delete deprecated is_flash_attn in import_utils.py
ehuaa Feb 27, 2024
8a1faf2
Add compatibility with skip_memory_metrics for mps device (#29264)
SunMarc Feb 27, 2024
ddf7ac4
Token level timestamps for long-form generation in Whisper (#29148)
zucchini-nlp Feb 27, 2024
227cd54
Fix a few typos in `GenerationMixin`'s docstring (#29277)
sadra-barikbin Feb 27, 2024
83ab011
[i18n-zh] Translate fsdp.md into Chinese (#29305)
windsonsea Feb 27, 2024
63caa37
Starcoder2 model - bis (#29215)
RaymondLi0 Feb 28, 2024
bd5b986
simplify get_class_in_module and fix for paths containing a dot (#29262)
cebtenzzre Feb 28, 2024
ad00c48
FIX [`Gemma` / `CI`] Make sure our runners have access to the model (…
younesbelkada Feb 28, 2024
e715c78
Remove numpy usage from owlvit (#29326)
fxmarty Feb 28, 2024
a528885
[`require_read_token`] fix typo (#29345)
ArthurZucker Feb 28, 2024
7c87f35
[`T5 and Llama Tokenizer`] remove warning (#29346)
ArthurZucker Feb 28, 2024
8a8a0a4
[`Llama ROPE`] Fix torch export but also slow downs in forward (#29198)
ArthurZucker Feb 28, 2024
2ce56d3
Disable Mixtral `output_router_logits` during inference (#29249)
LeonardoEmili Feb 28, 2024
7628b3a
Idefics: generate fix (#29320)
gante Feb 28, 2024
d3a4b47
RoPE loses precision for Llama / Gemma + Gemma logits.float() (#29285)
danielhanchen Feb 28, 2024
554e7ad
check if position_ids exists before using it (#29306)
jiqing-feng Feb 28, 2024
f54d82c
[CI] Quantization workflow (#29046)
SunMarc Feb 28, 2024
49204c1
Better SDPA unmasking implementation (#29318)
fxmarty Feb 28, 2024
2209b7a
[i18n-zh] Sync source/zh/index.md (#29331)
windsonsea Feb 28, 2024
1aee9af
FIX [`CI` / `starcoder2`] Change starcoder2 path to correct one for s…
younesbelkada Feb 29, 2024
8d8ac9c
FIX [`CI`]: Fix failing tests for peft integration (#29330)
younesbelkada Feb 29, 2024
b647acd
FIX [`CI`] `require_read_token` in the llama FA2 test (#29361)
younesbelkada Feb 29, 2024
44fe1a1
Avoid using uncessary `get_values(MODEL_MAPPING)` (#29362)
ydshieh Feb 29, 2024
bb4f816
Patch YOLOS and others (#29353)
NielsRogge Feb 29, 2024
0ad770c
Fix @require_read_token in tests (#29367)
Wauplin Feb 29, 2024
5ee0868
Expose `offload_buffers` parameter of `accelerate` to `PreTrainedMode…
notsyncing Mar 1, 2024
2858d6c
Fix Base Model Name of LlamaForQuestionAnswering (#29258)
lenglaender Mar 1, 2024
50db7ca
FIX [`quantization` / `ESM`] Fix ESM 8bit / 4bit with bitsandbytes (#…
younesbelkada Mar 1, 2024
e7b9837
[`Llama + AWQ`] fix `prepare_inputs_for_generation` 🫠 (#29381)
ArthurZucker Mar 1, 2024
0a0a279
🚨🚨[Whisper Tok] Update integration test (#29368)
sanchit-gandhi Mar 1, 2024
f1b1379
[`YOLOS`] Fix - return padded annotations (#29300)
amyeroberts Mar 1, 2024
15f8296
Support subfolder with `AutoProcessor` (#29169)
JingyaHuang Mar 1, 2024
cec7733
Fix llama + gemma accelete tests (#29380)
SunMarc Mar 1, 2024
1a7c117
Fix deprecated arg issue (#29372)
muellerzr Mar 1, 2024
831bc25
Correct zero division error in inverse sqrt scheduler (#28982)
DavidAfonsoValente Mar 1, 2024
773d8c8
add sliding window param to sdpa after torch==2.2.0
ehuaa Feb 22, 2024
4fab890
revert add sliding window for qwen2 because of numerical error
ehuaa Feb 22, 2024
074d47a
add sliding window param to sdpa after torch==2.2.0
ehuaa Feb 22, 2024
6972cdf
remove adding sliding_window param to qwen2 because of numerical error
ehuaa Feb 22, 2024
8611c2d
fix style
ehuaa Feb 22, 2024
510f24f
only add non-contigous mask to qwen2 due to numerical error
ehuaa Feb 22, 2024
cbfc413
revert non-contigous tensor modification
ehuaa Feb 24, 2024
6d590b0
move torch version judgement to import_utils for usability
ehuaa Feb 27, 2024
da327f7
upload a test for compare flash vs sdpa for sliding window in Mistral
ehuaa Mar 2, 2024
8171137
Merge branch 'add_sliding_window_for_sdpa' of https://mirror.ghproxy.…
ehuaa Mar 2, 2024
2 changes: 2 additions & 0 deletions src/transformers/models/mistral/modeling_mistral.py
@@ -39,6 +39,7 @@
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
+    is_torch_version_greater_or_equal_than_2_2_0,
     logging,
     replace_return_docstrings,
 )
@@ -1006,6 +1007,7 @@ def forward(
                 (batch_size, seq_length),
                 inputs_embeds,
                 past_key_values_length,
+                sliding_window=self.config.sliding_window if is_torch_version_greater_or_equal_than_2_2_0 else None,
             )
         else:
             # 4d mask is passed through the layers
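For context, the `sliding_window` argument restricts each position to attending only to the most recent `sliding_window` tokens, and that constraint is carried by the boolean 4D mask handed to `torch.nn.functional.scaled_dot_product_attention`. Below is a minimal, self-contained sketch of that mask pattern, not code from this PR: the shapes, the window of 4, and the eager reference are illustrative, and the parity check only mirrors the spirit of the flash-vs-SDPA test added later in the branch.

import torch
import torch.nn.functional as F

def sliding_window_causal_mask(seq_len: int, window: int) -> torch.Tensor:
    # Boolean mask, True = "may attend": position i sees positions j
    # with i - window < j <= i (causal, limited to the last `window` tokens).
    i = torch.arange(seq_len).unsqueeze(-1)
    j = torch.arange(seq_len).unsqueeze(0)
    return (j <= i) & (j > i - window)

def eager_attention(q, k, v, mask):
    # Plain softmax(QK^T / sqrt(d)) V reference using the same boolean mask.
    scores = q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5
    scores = scores.masked_fill(~mask, float("-inf"))
    return torch.softmax(scores, dim=-1) @ v

torch.manual_seed(0)
q, k, v = (torch.randn(1, 8, 16, 64) for _ in range(3))  # (batch, heads, seq, head_dim)
mask = sliding_window_causal_mask(seq_len=16, window=4)   # broadcasts over batch and heads
sdpa_out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
eager_out = eager_attention(q, k, v, mask)
assert torch.allclose(sdpa_out, eager_out, atol=1e-5)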
3 changes: 3 additions & 0 deletions src/transformers/models/mixtral/modeling_mixtral.py
@@ -47,6 +47,7 @@
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
+    is_torch_version_greater_or_equal_than_2_2_0,
     logging,
     replace_return_docstrings,
 )
@@ -60,6 +61,7 @@
 
     _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
 
+
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
 if is_torch_fx_available():
@@ -1190,6 +1192,7 @@ def forward(
                 (batch_size, seq_length),
                 inputs_embeds,
                 past_key_values_length,
+                sliding_window=self.config.sliding_window if is_torch_version_greater_or_equal_than_2_2_0 else None,
             )
         else:
             # 4d mask is passed through the layers
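The `_flash_supports_window_size` line kept in the hunk above shows the feature-detection pattern the flash-attention path relies on: probe the installed kernel's signature at import time rather than pin a version. A generic sketch of the same idea follows; the `attention` stand-in and its `window_size` parameter are placeholders, not the real flash-attn API.

import inspect

def accepts_kwarg(fn, name: str) -> bool:
    # True if the callable exposes a parameter with this name.
    return name in inspect.signature(fn).parameters

def attention(q, k, v, window_size=None):
    # Stand-in for a third-party kernel whose signature may or may not
    # include the optional `window_size` argument.
    return q

kwargs = {"window_size": (4, 0)} if accepts_kwarg(attention, "window_size") else {}
out = attention("q", "k", "v", **kwargs)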
1 change: 1 addition & 0 deletions src/transformers/utils/__init__.py
@@ -188,6 +188,7 @@
     is_torch_tensorrt_fx_available,
     is_torch_tf32_available,
     is_torch_tpu_available,
+    is_torch_version_greater_or_equal_than_2_2_0,
     is_torch_xpu_available,
     is_torchaudio_available,
     is_torchdistx_available,
4 changes: 4 additions & 0 deletions src/transformers/utils/import_utils.py
@@ -665,6 +665,10 @@ def is_flash_attn_greater_or_equal_2_10():
     return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.1.0")
 
 
+def is_torch_version_greater_or_equal_than_2_2_0():
+    return version.parse(get_torch_version()) >= version.parse("2.2.0")
+
+
 def is_torchdistx_available():
     return _torchdistx_available
 
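As a usage note, the new helper returns a bool, so callers gate optional behaviour on the result of calling it. A standalone equivalent of the check is sketched below, assuming `torch` and `packaging` are installed; the 4096 value is only an example window size (e.g. the MistralConfig default), not anything this PR sets.

import importlib.metadata
from packaging import version

def is_torch_version_greater_or_equal_than_2_2_0() -> bool:
    # Same comparison as the helper above, without transformers' get_torch_version().
    return version.parse(importlib.metadata.version("torch")) >= version.parse("2.2.0")

# Only pass a sliding window to the SDPA mask preparation on torch >= 2.2.0.
sliding_window = 4096  # example value, e.g. MistralConfig's default window size
effective_window = sliding_window if is_torch_version_greater_or_equal_than_2_2_0() else None
print(effective_window)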