From 2d757002fc25e33b0c8b949340f3ef030b1711b4 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Fri, 6 Sep 2024 17:06:39 +0200
Subject: [PATCH] red-ci on main, fix copies (#33356)

* fix copies

* ???
---
 src/transformers/models/camembert/modeling_camembert.py     | 6 +++---
 src/transformers/models/roberta/modeling_roberta.py         | 6 +++---
 src/transformers/models/xlm_roberta/modeling_xlm_roberta.py | 6 +++---
 .../models/xlm_roberta_xl/modeling_xlm_roberta_xl.py        | 6 +++---
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py
index 03a60a2856be2e..0d12c800c156f7 100644
--- a/src/transformers/models/camembert/modeling_camembert.py
+++ b/src/transformers/models/camembert/modeling_camembert.py
@@ -916,7 +916,7 @@ def forward(
         encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
             the model is configured as a decoder.
-        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*):
             Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
             the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

@@ -986,7 +986,7 @@ def forward(
         )

         # Expand the attention mask
-        if use_sdpa_attention_masks:
+        if use_sdpa_attention_masks and attention_mask.dim() == 2:
             # Expand the attention mask for SDPA.
             # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
             if self.config.is_decoder:
@@ -1013,7 +1013,7 @@ def forward(
             if encoder_attention_mask is None:
                 encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)

-            if use_sdpa_attention_masks:
+            if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
                 # Expand the attention mask for SDPA.
                 # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
                 encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py
index f1f83147527dfb..bbf16ec039b4cd 100644
--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -857,7 +857,7 @@ def forward(
         encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
             the model is configured as a decoder.
-        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*):
             Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
             the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

@@ -927,7 +927,7 @@ def forward(
         )

         # Expand the attention mask
-        if use_sdpa_attention_masks:
+        if use_sdpa_attention_masks and attention_mask.dim() == 2:
             # Expand the attention mask for SDPA.
             # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
             if self.config.is_decoder:
@@ -954,7 +954,7 @@ def forward(
             if encoder_attention_mask is None:
                 encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)

-            if use_sdpa_attention_masks:
+            if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
                 # Expand the attention mask for SDPA.
                 # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
                 encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
index 3ac94e75f92fe4..2adae33fbd50a8 100644
--- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
@@ -858,7 +858,7 @@ def forward(
         encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
             the model is configured as a decoder.
-        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*):
             Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
             the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

@@ -928,7 +928,7 @@ def forward(
         )

         # Expand the attention mask
-        if use_sdpa_attention_masks:
+        if use_sdpa_attention_masks and attention_mask.dim() == 2:
             # Expand the attention mask for SDPA.
             # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
             if self.config.is_decoder:
@@ -955,7 +955,7 @@ def forward(
             if encoder_attention_mask is None:
                 encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)

-            if use_sdpa_attention_masks:
+            if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
                 # Expand the attention mask for SDPA.
                 # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
                 encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
index f66a32291794f1..f86abf823e9026 100644
--- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
+++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
@@ -839,7 +839,7 @@ def forward(
         encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
             the model is configured as a decoder.
-        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*):
             Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
             the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

@@ -909,7 +909,7 @@ def forward(
         )

         # Expand the attention mask
-        if use_sdpa_attention_masks:
+        if use_sdpa_attention_masks and attention_mask.dim() == 2:
             # Expand the attention mask for SDPA.
             # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
             if self.config.is_decoder:
@@ -936,7 +936,7 @@ def forward(
             if encoder_attention_mask is None:
                 encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)

-            if use_sdpa_attention_masks:
+            if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
                 # Expand the attention mask for SDPA.
                 # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
                 encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(