
Commit af9b2ea

chore: fix typos in language models (#36586)
* chore: fix typos in language models
* chore: fix typos in mistral model
* chore: fix model copy from issue
* chore: fix model copy from issue
* chore: fix model copy from issue
* chore: fix model copy from issue
* chore: fix model copy from issue
1 parent a929c46 commit af9b2ea


115 files changed: +144 / -144 lines changed


src/transformers/models/aria/modeling_aria.py

Lines changed: 1 addition & 1 deletion
@@ -1094,7 +1094,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
             dtype (`torch.dtype`):
                 The dtype to use for the 4D attention mask.
             device (`torch.device`):
-                The device to plcae the 4D attention mask on.
+                The device to place the 4D attention mask on.
             cache_position (`torch.Tensor`):
                 Indices depicting the position of the input sequence tokens in the sequence.
             batch_size (`torch.Tensor`):
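
The docstring patched above belongs to `_prepare_4d_causal_attention_mask_with_cache_position`, which materializes a `(batch_size, 1, query_length, target_length)` causal mask on the requested `device` and `dtype`, using `cache_position` to locate the current query tokens. A minimal sketch of that idea, assuming a simple additive-mask convention (`make_4d_causal_mask` is illustrative only, not the transformers implementation):

```python
# Minimal sketch (not the transformers implementation): build a 4D additive causal
# mask where 0.0 means "attend" and torch.finfo(dtype).min means "masked".
import torch

def make_4d_causal_mask(query_length, target_length, dtype, device, cache_position, batch_size):
    min_value = torch.finfo(dtype).min
    # Start fully masked, then unmask every key position at or before each query's position.
    mask = torch.full((query_length, target_length), min_value, dtype=dtype, device=device)
    key_positions = torch.arange(target_length, device=device)
    mask = mask.masked_fill(key_positions[None, :] <= cache_position[:, None], 0.0)
    # Broadcast to (batch_size, 1, query_length, target_length).
    return mask[None, None, :, :].expand(batch_size, 1, -1, -1)

# Decoding 3 new tokens at absolute positions 5..7 against a key/value length of 8:
mask = make_4d_causal_mask(
    query_length=3, target_length=8, dtype=torch.float32, device="cpu",
    cache_position=torch.arange(5, 8), batch_size=2,
)
print(mask.shape)  # torch.Size([2, 1, 3, 8])
```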

src/transformers/models/bamba/modeling_bamba.py

Lines changed: 1 addition & 1 deletion
@@ -1399,7 +1399,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
             dtype (`torch.dtype`):
                 The dtype to use for the 4D attention mask.
             device (`torch.device`):
-                The device to plcae the 4D attention mask on.
+                The device to place the 4D attention mask on.
             cache_position (`torch.Tensor`):
                 Indices depicting the position of the input sequence tokens in the sequence.
             batch_size (`torch.Tensor`):

src/transformers/models/bamba/modular_bamba.py

Lines changed: 1 addition & 1 deletion
@@ -1140,7 +1140,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
             dtype (`torch.dtype`):
                 The dtype to use for the 4D attention mask.
             device (`torch.device`):
-                The device to plcae the 4D attention mask on.
+                The device to place the 4D attention mask on.
             cache_position (`torch.Tensor`):
                 Indices depicting the position of the input sequence tokens in the sequence.
             batch_size (`torch.Tensor`):

src/transformers/models/bark/modeling_bark.py

Lines changed: 1 addition & 1 deletion
@@ -201,7 +201,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
 
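
The comment corrected in this hunk (and the identical one in bart below) concerns mask alignment when the query is shorter than the key sequence, e.g. during cached generation. A small illustration of the two conventions with plain boolean masks (explanatory only, not flash-attention or transformers code):

```python
# For q_len=2, k_len=5: a "bottom-right" aligned causal mask lets the last query
# attend to every key, while a "top-left" aligned mask anchors the diagonal at (0, 0).
import torch

q_len, k_len = 2, 5
i = torch.arange(q_len)[:, None]   # query positions
j = torch.arange(k_len)[None, :]   # key positions

top_left = j <= i                        # diagonal starts at (0, 0)
bottom_right = j <= i + (k_len - q_len)  # diagonal ends at (q_len - 1, k_len - 1)

print(top_left.int())
# tensor([[1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0]])
print(bottom_right.int())
# tensor([[1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])
```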

src/transformers/models/bart/modeling_bart.py

Lines changed: 1 addition & 1 deletion
@@ -298,7 +298,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
 

src/transformers/models/bart/modeling_flax_bart.py

Lines changed: 1 addition & 1 deletion
@@ -274,7 +274,7 @@ def _merge_heads(self, hidden_states):
     def _concatenate_to_cache(self, key, value, query, attention_mask):
         """
         This function takes projected key, value states from a single input token and concatenates the states to cached
-        states from previous steps. This function is slighly adapted from the official Flax repository:
+        states from previous steps. This function is slightly adapted from the official Flax repository:
         https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
         """
         # detect if we're initializing by absence of existing cache data.
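
The `_concatenate_to_cache` docstring fixed here (and in the remaining Flax models below) describes writing the projected key/value states of the current token into a pre-allocated autoregressive cache. A rough sketch of that update with plain JAX arrays, using illustrative names rather than the flax.linen cache variables the real method relies on:

```python
# Illustrative sketch of the cache update: place the new key/value slice at
# `cache_index` inside a fixed-size cache, then advance the index.
import jax
import jax.numpy as jnp

def update_cache(cached_key, cached_value, cache_index, key, value):
    # cached_key/value: (batch, max_length, num_heads, head_dim)
    # key/value:        (batch, 1, num_heads, head_dim) for a single new token
    indices = (0, cache_index, 0, 0)
    cached_key = jax.lax.dynamic_update_slice(cached_key, key, indices)
    cached_value = jax.lax.dynamic_update_slice(cached_value, value, indices)
    return cached_key, cached_value, cache_index + key.shape[1]

batch, max_len, heads, dim = 1, 8, 2, 4
cache_k = jnp.zeros((batch, max_len, heads, dim))
cache_v = jnp.zeros((batch, max_len, heads, dim))
new_k = jnp.ones((batch, 1, heads, dim))
new_v = jnp.ones((batch, 1, heads, dim))
cache_k, cache_v, idx = update_cache(cache_k, cache_v, 0, new_k, new_v)
print(idx, cache_k[0, 0, 0, 0])  # 1 1.0
```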

src/transformers/models/bert/modeling_flax_bert.py

Lines changed: 1 addition & 1 deletion
@@ -263,7 +263,7 @@ def _merge_heads(self, hidden_states):
     def _concatenate_to_cache(self, key, value, query, attention_mask):
         """
         This function takes projected key, value states from a single input token and concatenates the states to cached
-        states from previous steps. This function is slighly adapted from the official Flax repository:
+        states from previous steps. This function is slightly adapted from the official Flax repository:
         https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
         """
         # detect if we're initializing by absence of existing cache data.

src/transformers/models/big_bird/modeling_flax_big_bird.py

Lines changed: 1 addition & 1 deletion
@@ -284,7 +284,7 @@ def _merge_heads(self, hidden_states):
     def _concatenate_to_cache(self, key, value, query, attention_mask):
         """
         This function takes projected key, value states from a single input token and concatenates the states to cached
-        states from previous steps. This function is slighly adapted from the official Flax repository:
+        states from previous steps. This function is slightly adapted from the official Flax repository:
         https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
         """
         # detect if we're initializing by absence of existing cache data.

src/transformers/models/blenderbot/modeling_flax_blenderbot.py

Lines changed: 1 addition & 1 deletion
@@ -262,7 +262,7 @@ def _merge_heads(self, hidden_states):
     def _concatenate_to_cache(self, key, value, query, attention_mask):
         """
         This function takes projected key, value states from a single input token and concatenates the states to cached
-        states from previous steps. This function is slighly adapted from the official Flax repository:
+        states from previous steps. This function is slightly adapted from the official Flax repository:
         https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
         """
         # detect if we're initializing by absence of existing cache data.

src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py

Lines changed: 1 addition & 1 deletion
@@ -273,7 +273,7 @@ def _merge_heads(self, hidden_states):
     def _concatenate_to_cache(self, key, value, query, attention_mask):
         """
         This function takes projected key, value states from a single input token and concatenates the states to cached
-        states from previous steps. This function is slighly adapted from the official Flax repository:
+        states from previous steps. This function is slightly adapted from the official Flax repository:
         https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
         """
         # detect if we're initializing by absence of existing cache data.
