[mistral] Fix FA2 attention reshape for Mistral Nemo (#32065)

* [mistral] Fix FA2 attention reshape
* [run-slow] mistral
huggingface · Jul 19, 2024 · 22f888b · 22f888b
1 parent cd48553
commit 22f888b
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py
@@ -387,7 +387,7 @@ def forward(
             is_causal=self.is_causal,
         )
 
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous()
         attn_output = self.o_proj(attn_output)
 
         if not output_attentions: