diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py
index 011eee1f24b54b..7a8a1c04910502 100755
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -128,12 +128,14 @@ def __init__(self, num_embeddings: int, embedding_dim: int):
         self.offset = 2
         super().__init__(num_embeddings + self.offset, embedding_dim)

-    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
-        """`input_ids_shape` is expected to be [bsz x seqlen]."""
-        bsz, seq_len = input_ids_shape[:2]
+    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
+        """`input_ids`' shape is expected to be [bsz x seqlen]."""
+
+        bsz, seq_len = input_ids.shape[:2]
         positions = torch.arange(
             past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
-        )
+        ).expand(bsz, -1)
+
         return super().forward(positions + self.offset)
@@ -788,17 +790,17 @@ def forward(
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
+            input = input_ids
+            input_ids = input_ids.view(-1, input_ids.shape[-1])
         elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")

         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

-        embed_pos = self.embed_positions(input_shape)
+        embed_pos = self.embed_positions(input)

         hidden_states = inputs_embeds + embed_pos
         hidden_states = self.layernorm_embedding(hidden_states)
@@ -1013,10 +1015,12 @@ def forward(
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
         elif input_ids is not None:
-            input_shape = input_ids.size()
+            input = input_ids
+            input_shape = input.shape
             input_ids = input_ids.view(-1, input_shape[-1])
         elif inputs_embeds is not None:
             input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1]
         else:
             raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
@@ -1024,7 +1028,7 @@ def forward(
         past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

         if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+            inputs_embeds = self.embed_tokens(input) * self.embed_scale

         attention_mask = self._prepare_decoder_attention_mask(
             attention_mask, input_shape, inputs_embeds, past_key_values_length
@@ -1036,7 +1040,7 @@ def forward(
             encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])

         # embed positions
-        positions = self.embed_positions(input_shape, past_key_values_length)
+        positions = self.embed_positions(input, past_key_values_length)

         hidden_states = inputs_embeds + positions
         hidden_states = self.layernorm_embedding(hidden_states)
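Note on the signature change above: `BartLearnedPositionalEmbedding.forward` now takes the `input_ids` tensor itself rather than a `torch.Size`, and expands the position ids across the batch. A minimal standalone sketch of the patched class shows the new calling convention (the class here is a local re-implementation for illustration, not imported from transformers):

```python
import torch
from torch import nn


class LearnedPositionalEmbedding(nn.Embedding):
    """Mirrors BartLearnedPositionalEmbedding after this patch."""

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # Bart reserves the first two embedding slots, hence the offset.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
        bsz, seq_len = input_ids.shape[:2]
        positions = torch.arange(
            past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
        ).expand(bsz, -1)
        return super().forward(positions + self.offset)


embed = LearnedPositionalEmbedding(1024, 16)
ids = torch.zeros(4, 7, dtype=torch.long)          # [bsz, seq_len]
print(embed(ids).shape)                            # torch.Size([4, 7, 16])
print(embed(ids, past_key_values_length=3).shape)  # positions 3..9, same shape
```

Only `shape[:2]` and the module's own `weight.device` are read from the argument, which is what makes the `inputs_embeds` stand-in used throughout the hunks below possible.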
diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py
index b9057178a03227..ad3d4243b0aed0 100755
--- a/src/transformers/models/mbart/modeling_mbart.py
+++ b/src/transformers/models/mbart/modeling_mbart.py
@@ -134,12 +134,14 @@ def __init__(self, num_embeddings: int, embedding_dim: int):
         self.offset = 2
         super().__init__(num_embeddings + self.offset, embedding_dim)

-    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
-        """`input_ids_shape` is expected to be [bsz x seqlen]."""
-        bsz, seq_len = input_ids_shape[:2]
+    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
+        """`input_ids`' shape is expected to be [bsz x seqlen]."""
+
+        bsz, seq_len = input_ids.shape[:2]
         positions = torch.arange(
             past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
-        )
+        ).expand(bsz, -1)
+
         return super().forward(positions + self.offset)
@@ -783,17 +785,18 @@ def forward(
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
-            input_shape = input_ids.size()
+            input = input_ids
+            input_shape = input.shape
             input_ids = input_ids.view(-1, input_shape[-1])
         elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")

         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

-        embed_pos = self.embed_positions(input_shape)
+        embed_pos = self.embed_positions(input)

         hidden_states = inputs_embeds + embed_pos
         hidden_states = self.layernorm_embedding(hidden_states)
@@ -1011,10 +1014,12 @@ def forward(
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
         elif input_ids is not None:
-            input_shape = input_ids.size()
+            input = input_ids
+            input_shape = input.size()
             input_ids = input_ids.view(-1, input_shape[-1])
         elif inputs_embeds is not None:
             input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1]
         else:
             raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
@@ -1034,7 +1039,7 @@ def forward(
             encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])

         # embed positions
-        positions = self.embed_positions(input_shape, past_key_values_length)
+        positions = self.embed_positions(input, past_key_values_length)

         hidden_states = inputs_embeds + positions
         hidden_states = self.layernorm_embedding(hidden_states)
diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py
index d3d239c4cff125..37c1a7d837f7ba 100644
--- a/src/transformers/models/mvp/modeling_mvp.py
+++ b/src/transformers/models/mvp/modeling_mvp.py
@@ -134,12 +134,14 @@ def __init__(self, num_embeddings: int, embedding_dim: int):
         self.offset = 2
         super().__init__(num_embeddings + self.offset, embedding_dim)

-    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
-        """`input_ids_shape` is expected to be [bsz x seqlen]."""
-        bsz, seq_len = input_ids_shape[:2]
+    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
+        """`input_ids`' shape is expected to be [bsz x seqlen]."""
+
+        bsz, seq_len = input_ids.shape[:2]
         positions = torch.arange(
             past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
-        )
+        ).expand(bsz, -1)
+
         return super().forward(positions + self.offset)
@@ -895,17 +897,19 @@ def forward(
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
-            input_shape = input_ids.size()
+            input = input_ids
+            input_shape = input.shape
             input_ids = input_ids.view(-1, input_shape[-1])
         elif inputs_embeds is not None:
             input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")

         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

-        embed_pos = self.embed_positions(input_shape)
+        embed_pos = self.embed_positions(input)

         hidden_states = inputs_embeds + embed_pos
         hidden_states = self.layernorm_embedding(hidden_states)
@@ -1144,10 +1148,12 @@ def forward(
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
         elif input_ids is not None:
-            input_shape = input_ids.size()
+            input = input_ids
+            input_shape = input_ids.shape
             input_ids = input_ids.view(-1, input_shape[-1])
         elif inputs_embeds is not None:
             input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1]
         else:
             raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
@@ -1167,7 +1173,7 @@ def forward(
             encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])

         # embed positions
-        positions = self.embed_positions(input_shape, past_key_values_length)
+        positions = self.embed_positions(input, past_key_values_length)

         hidden_states = inputs_embeds + positions
         hidden_states = self.layernorm_embedding(hidden_states)
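A note on the recurring `input = inputs_embeds[:, :, -1]` line in these hunks: since the patched positional embedding only reads `shape[:2]` of its argument, slicing off the last hidden dimension is simply a cheap way to build a `[bsz, seq_len]` stand-in for `input_ids`; its values are never read. A quick sketch with hypothetical sizes:

```python
import torch

inputs_embeds = torch.randn(4, 7, 16)  # [bsz, seq_len, hidden_size]
proxy = inputs_embeds[:, :, -1]        # [bsz, seq_len]; only shape and device matter
print(proxy.shape)                     # torch.Size([4, 7])
```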
diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py
index eb8b5d2b41671c..de530683f34ed6 100755
--- a/src/transformers/models/plbart/modeling_plbart.py
+++ b/src/transformers/models/plbart/modeling_plbart.py
@@ -131,12 +131,14 @@ def __init__(self, num_embeddings: int, embedding_dim: int):
         self.offset = 2
         super().__init__(num_embeddings + self.offset, embedding_dim)

-    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
-        """`input_ids_shape` is expected to be [bsz x seqlen]."""
-        bsz, seq_len = input_ids_shape[:2]
+    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
+        """`input_ids`' shape is expected to be [bsz x seqlen]."""
+
+        bsz, seq_len = input_ids.shape[:2]
         positions = torch.arange(
             past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
-        )
+        ).expand(bsz, -1)
+
         return super().forward(positions + self.offset)
@@ -759,17 +761,17 @@ def forward(
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
+            input = input_ids
+            input_ids = input_ids.view(-1, input_ids.shape[-1])
         elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")

         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

-        embed_pos = self.embed_positions(input_shape)
+        embed_pos = self.embed_positions(input)

         hidden_states = inputs_embeds + embed_pos
         hidden_states = self.layernorm_embedding(hidden_states)
@@ -985,10 +987,12 @@ def forward(
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
         elif input_ids is not None:
-            input_shape = input_ids.size()
+            input = input_ids
+            input_shape = input.shape
             input_ids = input_ids.view(-1, input_shape[-1])
         elif inputs_embeds is not None:
             input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1]
         else:
             raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
@@ -996,7 +1000,7 @@ def forward(
         past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

         if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+            inputs_embeds = self.embed_tokens(input) * self.embed_scale

         attention_mask = self._prepare_decoder_attention_mask(
             attention_mask, input_shape, inputs_embeds, past_key_values_length
@@ -1008,7 +1012,7 @@ def forward(
             encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])

         # embed positions
-        positions = self.embed_positions(input_shape, past_key_values_length)
+        positions = self.embed_positions(input, past_key_values_length)

         hidden_states = inputs_embeds + positions
         hidden_states = self.layernorm_embedding(hidden_states)
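On the decoder hunks that switch `self.embed_tokens(input_ids)` to `self.embed_tokens(input)`: for the usual 2-D `input_ids` the two are interchangeable, because `.view(-1, seq_len)` is a no-op on a `[bsz, seq_len]` tensor, so `input` and the reshaped `input_ids` hold the same values. A small sanity check under that assumption (names illustrative):

```python
import torch
from torch import nn

embed_tokens = nn.Embedding(100, 16)
input = torch.randint(0, 100, (4, 7))        # [bsz, seq_len], as in the diff
input_ids = input.view(-1, input.shape[-1])  # still [4, 7] for 2-D input
assert torch.equal(embed_tokens(input), embed_tokens(input_ids))
```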
diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py
index a79e5e901d67c4..e25f73c8b7d3b5 100644
--- a/src/transformers/models/trocr/modeling_trocr.py
+++ b/src/transformers/models/trocr/modeling_trocr.py
@@ -87,12 +87,14 @@ def __init__(self, num_embeddings: int, embedding_dim: int):
         self.offset = 2
         super().__init__(num_embeddings + self.offset, embedding_dim)

-    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
-        """`input_ids_shape` is expected to be [bsz x seqlen]."""
-        bsz, seq_len = input_ids_shape[:2]
+    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
+        """`input_ids`' shape is expected to be [bsz x seqlen]."""
+
+        bsz, seq_len = input_ids.shape[:2]
         positions = torch.arange(
             past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
-        )
+        ).expand(bsz, -1)
+
         return super().forward(positions + self.offset)
@@ -626,10 +628,11 @@ def forward(
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
         elif input_ids is not None:
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
+            input = input_ids
+            input_ids = input_ids.view(-1, input.shape[-1])
         elif inputs_embeds is not None:
             input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1]
         else:
             raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
@@ -640,7 +643,7 @@ def forward(
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

         if self.config.use_learned_position_embeddings:
-            embed_pos = self.embed_positions(input_shape, past_key_values_length=past_key_values_length)
+            embed_pos = self.embed_positions(input, past_key_values_length=past_key_values_length)
         else:
             embed_pos = self.embed_positions(input_ids, past_key_values_length=past_key_values_length)
@@ -651,6 +654,8 @@ def forward(
         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

+        input_shape = input.shape
+
         attention_mask = self._prepare_decoder_attention_mask(
             attention_mask, input_shape, inputs_embeds, past_key_values_length
         )
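Finally, a quick smoke test of the embeds-only path these hunks touch. This is a sketch, not part of the patch: it assumes a transformers checkout with this diff applied and downloads the public `facebook/bart-base` checkpoint.

```python
import torch
from transformers import BartModel

model = BartModel.from_pretrained("facebook/bart-base")
# [0, 31414, 2] is "<s> Hello </s>" in the BART vocabulary.
embeds = model.get_input_embeddings()(torch.tensor([[0, 31414, 2]]))
with torch.no_grad():
    # Passing only embeds exercises the new `input = inputs_embeds[:, :, -1]` branch.
    out = model(inputs_embeds=embeds, decoder_inputs_embeds=embeds)
print(out.last_hidden_state.shape)  # torch.Size([1, 3, 768])
```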