
Commit af9b2ea

chore: fix typos in language models (#36586)
* chore: fix typos in language models
* chore: fix typos in mistral model
* chore: fix model copy from issue
* chore: fix model copy from issue
* chore: fix model copy from issue
* chore: fix model copy from issue
* chore: fix model copy from issue
1 parent a929c46 commit af9b2ea


115 files changed: +144 / -144 lines changed


src/transformers/models/aria/modeling_aria.py

Lines changed: 1 addition & 1 deletion
@@ -1094,7 +1094,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
             dtype (`torch.dtype`):
                 The dtype to use for the 4D attention mask.
             device (`torch.device`):
-                The device to plcae the 4D attention mask on.
+                The device to place the 4D attention mask on.
             cache_position (`torch.Tensor`):
                 Indices depicting the position of the input sequence tokens in the sequence.
             batch_size (`torch.Tensor`):
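
The docstring patched above belongs to `_prepare_4d_causal_attention_mask_with_cache_position`, which materializes a `(batch_size, 1, query_length, target_length)` causal mask on the requested `device` and `dtype`, using `cache_position` to locate the current query tokens. A minimal sketch of that idea, assuming a simple additive-mask convention (`make_4d_causal_mask` is illustrative only, not the transformers implementation):

```python
# Minimal sketch (not the transformers implementation): build a 4D additive causal
# mask where 0.0 means "attend" and torch.finfo(dtype).min means "masked".
import torch

def make_4d_causal_mask(query_length, target_length, dtype, device, cache_position, batch_size):
    min_value = torch.finfo(dtype).min
    # Start fully masked, then unmask every key position at or before each query's position.
    mask = torch.full((query_length, target_length), min_value, dtype=dtype, device=device)
    key_positions = torch.arange(target_length, device=device)
    mask = mask.masked_fill(key_positions[None, :] <= cache_position[:, None], 0.0)
    # Broadcast to (batch_size, 1, query_length, target_length).
    return mask[None, None, :, :].expand(batch_size, 1, -1, -1)

# Decoding 3 new tokens at absolute positions 5..7 against a key/value length of 8:
mask = make_4d_causal_mask(
    query_length=3, target_length=8, dtype=torch.float32, device="cpu",
    cache_position=torch.arange(5, 8), batch_size=2,
)
print(mask.shape)  # torch.Size([2, 1, 3, 8])
```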

src/transformers/models/bamba/modeling_bamba.py

Lines changed: 1 addition & 1 deletion
@@ -1399,7 +1399,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
             dtype (`torch.dtype`):
                 The dtype to use for the 4D attention mask.
             device (`torch.device`):
-                The device to plcae the 4D attention mask on.
+                The device to place the 4D attention mask on.
             cache_position (`torch.Tensor`):
                 Indices depicting the position of the input sequence tokens in the sequence.
             batch_size (`torch.Tensor`):

src/transformers/models/bamba/modular_bamba.py

Lines changed: 1 addition & 1 deletion
@@ -1140,7 +1140,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
             dtype (`torch.dtype`):
                 The dtype to use for the 4D attention mask.
             device (`torch.device`):
-                The device to plcae the 4D attention mask on.
+                The device to place the 4D attention mask on.
             cache_position (`torch.Tensor`):
                 Indices depicting the position of the input sequence tokens in the sequence.
             batch_size (`torch.Tensor`):

src/transformers/models/bark/modeling_bark.py

Lines changed: 1 addition & 1 deletion
@@ -201,7 +201,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
 
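
The comment corrected in this hunk (and the identical one in bart below) concerns mask alignment when the query is shorter than the key sequence, e.g. during cached generation. A small illustration of the two conventions with plain boolean masks (explanatory only, not flash-attention or transformers code):

```python
# For q_len=2, k_len=5: a "bottom-right" aligned causal mask lets the last query
# attend to every key, while a "top-left" aligned mask anchors the diagonal at (0, 0).
import torch

q_len, k_len = 2, 5
i = torch.arange(q_len)[:, None]   # query positions
j = torch.arange(k_len)[None, :]   # key positions

top_left = j <= i                        # diagonal starts at (0, 0)
bottom_right = j <= i + (k_len - q_len)  # diagonal ends at (q_len - 1, k_len - 1)

print(top_left.int())
# tensor([[1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0]])
print(bottom_right.int())
# tensor([[1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])
```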

src/transformers/models/bart/modeling_bart.py

Lines changed: 1 addition & 1 deletion
@@ -298,7 +298,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
 

src/transformers/models/bart/modeling_flax_bart.py

Lines changed: 1 addition & 1 deletion
@@ -274,7 +274,7 @@ def _merge_heads(self, hidden_states):
     def _concatenate_to_cache(self, key, value, query, attention_mask):
         """
         This function takes projected key, value states from a single input token and concatenates the states to cached
-        states from previous steps. This function is slighly adapted from the official Flax repository:
+        states from previous steps. This function is slightly adapted from the official Flax repository:
         https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
         """
         # detect if we're initializing by absence of existing cache data.
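
The `_concatenate_to_cache` docstring fixed here (and in the remaining Flax models below) describes writing the projected key/value states of the current token into a pre-allocated autoregressive cache. A rough sketch of that update with plain JAX arrays, using illustrative names rather than the flax.linen cache variables the real method relies on:

```python
# Illustrative sketch of the cache update: place the new key/value slice at
# `cache_index` inside a fixed-size cache, then advance the index.
import jax
import jax.numpy as jnp

def update_cache(cached_key, cached_value, cache_index, key, value):
    # cached_key/value: (batch, max_length, num_heads, head_dim)
    # key/value:        (batch, 1, num_heads, head_dim) for a single new token
    indices = (0, cache_index, 0, 0)
    cached_key = jax.lax.dynamic_update_slice(cached_key, key, indices)
    cached_value = jax.lax.dynamic_update_slice(cached_value, value, indices)
    return cached_key, cached_value, cache_index + key.shape[1]

batch, max_len, heads, dim = 1, 8, 2, 4
cache_k = jnp.zeros((batch, max_len, heads, dim))
cache_v = jnp.zeros((batch, max_len, heads, dim))
new_k = jnp.ones((batch, 1, heads, dim))
new_v = jnp.ones((batch, 1, heads, dim))
cache_k, cache_v, idx = update_cache(cache_k, cache_v, 0, new_k, new_v)
print(idx, cache_k[0, 0, 0, 0])  # 1 1.0
```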

src/transformers/models/bert/modeling_flax_bert.py

Lines changed: 1 addition & 1 deletion
@@ -263,7 +263,7 @@ def _merge_heads(self, hidden_states):
     def _concatenate_to_cache(self, key, value, query, attention_mask):
         """
         This function takes projected key, value states from a single input token and concatenates the states to cached
-        states from previous steps. This function is slighly adapted from the official Flax repository:
+        states from previous steps. This function is slightly adapted from the official Flax repository:
         https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
         """
         # detect if we're initializing by absence of existing cache data.

src/transformers/models/big_bird/modeling_flax_big_bird.py

Lines changed: 1 addition & 1 deletion
@@ -284,7 +284,7 @@ def _merge_heads(self, hidden_states):
     def _concatenate_to_cache(self, key, value, query, attention_mask):
         """
         This function takes projected key, value states from a single input token and concatenates the states to cached
-        states from previous steps. This function is slighly adapted from the official Flax repository:
+        states from previous steps. This function is slightly adapted from the official Flax repository:
         https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
         """
         # detect if we're initializing by absence of existing cache data.

src/transformers/models/blenderbot/modeling_flax_blenderbot.py

Lines changed: 1 addition & 1 deletion
@@ -262,7 +262,7 @@ def _merge_heads(self, hidden_states):
     def _concatenate_to_cache(self, key, value, query, attention_mask):
         """
         This function takes projected key, value states from a single input token and concatenates the states to cached
-        states from previous steps. This function is slighly adapted from the official Flax repository:
+        states from previous steps. This function is slightly adapted from the official Flax repository:
         https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
         """
         # detect if we're initializing by absence of existing cache data.

src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py

Lines changed: 1 addition & 1 deletion
@@ -273,7 +273,7 @@ def _merge_heads(self, hidden_states):
     def _concatenate_to_cache(self, key, value, query, attention_mask):
         """
         This function takes projected key, value states from a single input token and concatenates the states to cached
-        states from previous steps. This function is slighly adapted from the official Flax repository:
+        states from previous steps. This function is slightly adapted from the official Flax repository:
         https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
         """
         # detect if we're initializing by absence of existing cache data.
