[Mistral&Mixtral] Add sliding window for sdpa #29407

Closed
wants to merge 21 commits

Commits (21)
97cff89  add sliding window param to sdpa after torch==2.2.0  (ehuaa, Feb 22, 2024)
d0baf19  add sliding window param to sdpa after torch==2.2.0  (ehuaa, Feb 22, 2024)
c9dacb8  revert add sliding window for qwen2 because of numerical error  (ehuaa, Feb 22, 2024)
9c7bb07  Merge branch 'add_sliding_window_for_sdpa' of https://mirror.ghproxy.…  (ehuaa, Feb 22, 2024)
f464d15  remove adding sliding_window param to qwen2 because of numerical error  (ehuaa, Feb 22, 2024)
f4c21d0  fix style  (ehuaa, Feb 22, 2024)
a9f1571  only add non-contigous mask to qwen2 due to numerical error  (ehuaa, Feb 22, 2024)
8b84d68  revert non-contigous tensor modification  (ehuaa, Feb 24, 2024)
6bee407  move torch version judgement to import_utils for usability  (ehuaa, Feb 27, 2024)
cf225ff  Merge branch 'main' into add_sliding_window_for_sdpa  (ehuaa, Feb 27, 2024)
9c4f0b0  delete deprecated is_flash_attn in import_utils.py  (ehuaa, Feb 27, 2024)
773d8c8  add sliding window param to sdpa after torch==2.2.0  (ehuaa, Feb 22, 2024)
4fab890  revert add sliding window for qwen2 because of numerical error  (ehuaa, Feb 22, 2024)
074d47a  add sliding window param to sdpa after torch==2.2.0  (ehuaa, Feb 22, 2024)
6972cdf  remove adding sliding_window param to qwen2 because of numerical error  (ehuaa, Feb 22, 2024)
8611c2d  fix style  (ehuaa, Feb 22, 2024)
510f24f  only add non-contigous mask to qwen2 due to numerical error  (ehuaa, Feb 22, 2024)
cbfc413  revert non-contigous tensor modification  (ehuaa, Feb 24, 2024)
6d590b0  move torch version judgement to import_utils for usability  (ehuaa, Feb 27, 2024)
da327f7  upload a test for compare flash vs sdpa for sliding window in Mistral  (ehuaa, Mar 2, 2024)
8171137  Merge branch 'add_sliding_window_for_sdpa' of https://mirror.ghproxy.…  (ehuaa, Mar 2, 2024)

2 changes: 2 additions & 0 deletions src/transformers/models/mistral/modeling_mistral.py
@@ -39,6 +39,7 @@
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
is_torch_version_greater_or_equal_than_2_2_0,
logging,
replace_return_docstrings,
)
@@ -1006,6 +1007,7 @@ def forward(
(batch_size, seq_length),
inputs_embeds,
past_key_values_length,
sliding_window=self.config.sliding_window if is_torch_version_greater_or_equal_than_2_2_0 else None,
)
else:
# 4d mask is passed through the layers
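
For context, the sliding_window argument forwarded above is meant to restrict each query to a fixed-size window of the most recent keys. Below is a minimal, self-contained sketch of such a mask and of feeding it to torch.nn.functional.scaled_dot_product_attention; it is illustrative only, the helper name is made up, and it is not the _prepare_4d_causal_attention_mask_for_sdpa implementation.

import torch

def sliding_window_causal_mask(seq_len, window, dtype=torch.float32, device="cpu"):
    # Boolean grid of allowed (query i, key j) pairs: causal (j <= i) and
    # within the last `window` positions (j > i - window).
    i = torch.arange(seq_len, device=device).unsqueeze(-1)
    j = torch.arange(seq_len, device=device).unsqueeze(0)
    allowed = (j <= i) & (j > i - window)
    # Additive float mask: 0 where attention is allowed, a large negative value elsewhere.
    mask = torch.zeros(seq_len, seq_len, dtype=dtype, device=device)
    mask = mask.masked_fill(~allowed, torch.finfo(dtype).min)
    # Shape (1, 1, q_len, kv_len) so it broadcasts over batch and heads.
    return mask[None, None, :, :]

# Toy usage: (batch, heads, seq, head_dim) tensors with a window of 4 tokens.
q = k = v = torch.randn(1, 8, 16, 64)
attn_mask = sliding_window_causal_mask(seq_len=16, window=4)
out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
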
3 changes: 3 additions & 0 deletions src/transformers/models/mixtral/modeling_mixtral.py
@@ -47,6 +47,7 @@
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
is_torch_version_greater_or_equal_than_2_2_0,
logging,
replace_return_docstrings,
)
@@ -60,6 +61,7 @@

_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)


# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
# It means that the function will not be traced through and simply appear as a node in the graph.
if is_torch_fx_available():
@@ -1190,6 +1192,7 @@ def forward(
(batch_size, seq_length),
inputs_embeds,
past_key_values_length,
sliding_window=self.config.sliding_window if is_torch_version_greater_or_equal_than_2_2_0 else None,
Collaborator comment on this line: The issue here is that _prepare_4d_causal_attention_mask_for_sdpa seems to return None if attention_mask is None (which is the case in the test), while if we actually want to use the sliding window we need to return the full causal mask. cc @fxmarty (a minimal sketch of this behaviour follows just after this diff)

)
else:
# 4d mask is passed through the layers
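
Regarding the review comment above, here is a compact sketch of the behaviour being described, under the assumption that no padding mask is supplied; the helper name is hypothetical, not the real _prepare_4d_causal_attention_mask_for_sdpa. Returning None is only safe when plain causal attention is wanted, because SDPA can then use its is_causal fast path; once a sliding window is requested, the banded mask has to be materialized even though attention_mask is None.

import torch

def sdpa_mask_when_attention_mask_is_none(seq_len, sliding_window=None, dtype=torch.float32):
    if sliding_window is None:
        # Plain causal attention: no explicit mask needed, SDPA handles it with is_causal=True.
        return None
    # Sliding window requested: build the full 4D causal-plus-band mask,
    # otherwise the window restriction is silently dropped.
    i = torch.arange(seq_len).unsqueeze(-1)  # query positions
    j = torch.arange(seq_len).unsqueeze(0)   # key positions
    allowed = (j <= i) & (j > i - sliding_window)
    mask = torch.zeros(seq_len, seq_len, dtype=dtype)
    return mask.masked_fill(~allowed, torch.finfo(dtype).min)[None, None]
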
1 change: 1 addition & 0 deletions src/transformers/utils/__init__.py
@@ -188,6 +188,7 @@
is_torch_tensorrt_fx_available,
is_torch_tf32_available,
is_torch_tpu_available,
is_torch_version_greater_or_equal_than_2_2_0,
is_torch_xpu_available,
is_torchaudio_available,
is_torchdistx_available,
4 changes: 4 additions & 0 deletions src/transformers/utils/import_utils.py
@@ -665,6 +665,10 @@ def is_flash_attn_greater_or_equal_2_10():
return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.1.0")


def is_torch_version_greater_or_equal_than_2_2_0():
return version.parse(get_torch_version()) >= version.parse("2.2.0")


def is_torchdistx_available():
return _torchdistx_available

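
The helper added to import_utils.py is a small version gate. A sketch of how a call site can consume it follows; get_torch_version is stubbed locally so the snippet stands alone, and the sliding-window value is a placeholder. Since the helper is defined as a function, call sites need the trailing parentheses to obtain a boolean.

import importlib.metadata
from packaging import version

def get_torch_version():
    # Local stand-in for transformers' own get_torch_version helper.
    return importlib.metadata.version("torch")

def is_torch_version_greater_or_equal_than_2_2_0():
    return version.parse(get_torch_version()) >= version.parse("2.2.0")

# Forward the sliding window only on torch >= 2.2.0, mirroring the gate used in the diffs above.
sliding_window = 4096 if is_torch_version_greater_or_equal_than_2_2_0() else None
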
25 changes: 25 additions & 0 deletions tests/models/mistral/test_modeling_mistral.py
Collaborator: Thanks! Let's throw in a generation test as well and we should be good to go! 🤗

Contributor Author:
> Thanks! Let's throw in a generation test as well and we should be good to go! 🤗

Ok, and the flash vs sdpa test I submitted above does not pass; have you debugged it? I'm also curious about why it fails here.

Collaborator: No, I have not debugged it and I won't have the bandwidth; do you need help on this? cc @younesbelkada, I think this is pretty important.

Contributor Author:
> No, I have not debugged it and I won't have the bandwidth; do you need help on this? cc @younesbelkada, I think this is pretty important.

As for the generation test you mentioned above, I think test_model_7b_long_prompt_sdpa is enough; it covers generation with sdpa and a sliding window.

Contributor Author:
> No, I have not debugged it and I won't have the bandwidth; do you need help on this? cc @younesbelkada, I think this is pretty important.

And I see that Gemma has a similar sdpa logits test to the one I committed (https://github.com/huggingface/transformers/blob/main/tests/models/gemma/test_modeling_gemma.py#L471). I think that test passes, so maybe it can help with the debugging.

@@ -483,6 +483,31 @@ def test_model_7b_logits(self):
backend_empty_cache(torch_device)
gc.collect()

@slow
@require_flash_attn
@require_torch_sdpa
def test_model_7b_logits_long_with_sdpa_and_flash2(self):
input_ids = [1] + [306, 338] * 2048
model = MistralForCausalLM.from_pretrained(
"mistralai/Mistral-7B-v0.1", device_map="auto", attn_implementation="flash_attention_2"
)
input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
with torch.no_grad():
out = model(input_ids).logits.cpu()

input_ids = [1] + [306, 338] * 2048
Collaborator suggested change (drop the duplicated line):
- input_ids = [1] + [306, 338] * 2048

model = MistralForCausalLM.from_pretrained(
"mistralai/Mistral-7B-v0.1", device_map="auto", attn_implementation="sdpa"
)
Collaborator comment on lines +491 to +501:

Suggested change (load both models in torch.bfloat16):
- model = MistralForCausalLM.from_pretrained(
-     "mistralai/Mistral-7B-v0.1", device_map="auto", attn_implementation="flash_attention_2"
- )
- input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
- with torch.no_grad():
-     out = model(input_ids).logits.cpu()
- input_ids = [1] + [306, 338] * 2048
- model = MistralForCausalLM.from_pretrained(
-     "mistralai/Mistral-7B-v0.1", device_map="auto", attn_implementation="sdpa"
- )
+ model = MistralForCausalLM.from_pretrained(
+     "mistralai/Mistral-7B-v0.1", device_map="auto", attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16
+ )
+ input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
+ with torch.no_grad():
+     out = model(input_ids).logits.cpu()
+ input_ids = [1] + [306, 338] * 2048
+ model = MistralForCausalLM.from_pretrained(
+     "mistralai/Mistral-7B-v0.1", device_map="auto", attn_implementation="sdpa", torch_dtype=torch.bfloat16
+ )

I am getting an error because by default it seems to be float32.

Collaborator: this passes for me.

input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
Collaborator suggested change (drop the repeated tensor conversion):
- input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)

with torch.no_grad():
out1 = model(input_ids).logits.cpu()
torch.testing.assert_close(out.mean(-1), out1.mean(-1), atol=1e-2, rtol=1e-2)
Collaborator: let's make sure we test all logits, not just the mean.

Suggested change:
- torch.testing.assert_close(out.mean(-1), out1.mean(-1), atol=1e-2, rtol=1e-2)
+ torch.testing.assert_close(out, out1, atol=1e-4, rtol=1e-4)

with this, the test is failing:

>       torch.testing.assert_close(out, out1, atol=1e-4, rtol=1e-4)
E       AssertionError: Tensor-likes are not close!
E
E       Mismatched elements: 90967735 / 131104000 (69.4%)
E       Greatest absolute difference: 0.328125 at index (0, 2310, 338) (up to 0.0001 allowed)
E       Greatest relative difference: 114689.0 at index (0, 1267, 4581) (up to 0.0001 allowed)


del model
backend_empty_cache(torch_device)
gc.collect()

@slow
def test_model_7b_generation(self):
EXPECTED_TEXT_COMPLETION = """My favourite condiment is 100% ketchup. I love it on everything. I’m not a big"""
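
Folding the review suggestions on the new test together (load both models in torch.bfloat16, build input_ids once, compare the full logits rather than their means), the test would look roughly like the sketch below. It assumes the same imports and decorators as the surrounding test file; the atol/rtol values are the reviewer's suggestion, and per the failure pasted above the remaining numerical gap, not the test structure, is the open question.

    @slow
    @require_flash_attn
    @require_torch_sdpa
    def test_model_7b_logits_long_with_sdpa_and_flash2(self):
        input_ids = [1] + [306, 338] * 2048

        # Reference run through the FlashAttention-2 path.
        model = MistralForCausalLM.from_pretrained(
            "mistralai/Mistral-7B-v0.1",
            device_map="auto",
            attn_implementation="flash_attention_2",
            torch_dtype=torch.bfloat16,
        )
        input_tensor = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
        with torch.no_grad():
            out_flash = model(input_tensor).logits.cpu()
        del model
        backend_empty_cache(torch_device)
        gc.collect()

        # Same prompt through the SDPA path, which this PR intends to make honour the sliding window.
        model = MistralForCausalLM.from_pretrained(
            "mistralai/Mistral-7B-v0.1",
            device_map="auto",
            attn_implementation="sdpa",
            torch_dtype=torch.bfloat16,
        )
        input_tensor = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
        with torch.no_grad():
            out_sdpa = model(input_tensor).logits.cpu()

        torch.testing.assert_close(out_flash, out_sdpa, atol=1e-4, rtol=1e-4)

        del model
        backend_empty_cache(torch_device)
        gc.collect()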