From 5a05758f5bd316a832387c4b6969f443bc56bf7d Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Thu, 2 Jul 2020 16:18:40 -0400 Subject: [PATCH 01/11] encoder --- src/transformers/modeling_bert.py | 101 +++++++++++++++++++----------- 1 file changed, 65 insertions(+), 36 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 23d25cfa09847a..1354bf9d57e1d8 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -19,6 +19,7 @@ import logging import math import os +from typing import List, Optional, Tuple import warnings import torch @@ -141,6 +142,12 @@ def mish(x): ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish} +def act_fn(name: str, input: torch.Tensor) -> torch.Tensor: + if name == "gelu": return gelu(input) + if name == "relu": return torch.nn.functional.relu(input) + if name == "swish": return swish(input) + if name == "gelu_new": return gelu_new(input) + else: raise KeyError("Unknown activation function '" + name + "'!") BertLayerNorm = torch.nn.LayerNorm @@ -160,14 +167,21 @@ def __init__(self, config): self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): + def forward(self, input_ids: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None): if input_ids is not None: input_shape = input_ids.size() else: + assert inputs_embeds is not None input_shape = inputs_embeds.size()[:-1] seq_length = input_shape[1] - device = input_ids.device if input_ids is not None else inputs_embeds.device + + if input_ids is not None: + device = input_ids.device + else: + assert inputs_embeds is not None + device = inputs_embeds.device + if position_ids is None: position_ids = torch.arange(seq_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).expand(input_shape) @@ -175,7 +189,9 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) if inputs_embeds is None: + assert input_ids is not None inputs_embeds = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) @@ -206,18 +222,19 @@ def __init__(self, config): def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(*new_x_shape) + x = x.view(new_x_shape[0], new_x_shape[1], + new_x_shape[2], new_x_shape[3]) return x.permute(0, 2, 1, 3) def forward( self, hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - ): + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys @@ -243,7 +260,7 @@ def forward( attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. 
- attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. @@ -257,9 +274,11 @@ def forward( context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(*new_context_layer_shape) + context_layer = context_layer.view(new_context_layer_shape[0], + new_context_layer_shape[1], + new_context_layer_shape[2]) - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer, None) return outputs @@ -270,7 +289,7 @@ def __init__(self, config): self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, hidden_states, input_tensor): + def forward(self, hidden_states, input_tensor) -> torch.Tensor : hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) @@ -305,17 +324,17 @@ def prune_heads(self, heads): def forward( self, hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - ): + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: self_outputs = self.self( hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions, ) attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + outputs = (attention_output, self_outputs[1]) # add attentions if we output them return outputs @@ -323,14 +342,11 @@ class BertIntermediate(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act + self.act_fn_name = config.hidden_act def forward(self, hidden_states): hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = act_fn(self.act_fn_name, hidden_states) return hidden_states @@ -347,12 +363,22 @@ def forward(self, hidden_states, input_tensor): hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states +class DummyModule(nn.Module): + """ + Dummy Module as a placeholder for cross attention Module. + Used to appease TorchScript in BertLayers which are not decoders. 
+ """ + def __init__(self): + super().__init__() + def forward(self, a, b: Optional[torch.Tensor], c: Optional[torch.Tensor], d: Optional[torch.Tensor], e: Optional[torch.Tensor]): + return torch.zeros([0]) class BertLayer(nn.Module): def __init__(self, config): super().__init__() self.attention = BertAttention(config) self.is_decoder = config.is_decoder + self.crossattention = DummyModule() # dummy module if self.is_decoder: self.crossattention = BertAttention(config) self.intermediate = BertIntermediate(config) @@ -361,18 +387,22 @@ def __init__(self, config): def forward( self, hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - ): + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Returns the output hidden states, the self-attention weights (if returned), + and the cross-attention weights (if the layer is a decoder layer, and weights are returned). + """ self_attention_outputs = self.attention( hidden_states, attention_mask, head_mask, output_attentions=output_attentions, ) attention_output = self_attention_outputs[0] - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - + self_attention_weights = self_attention_outputs[1] + cross_attention_weights : Optional[torch.Tensor] = None if self.is_decoder and encoder_hidden_states is not None: cross_attention_outputs = self.crossattention( attention_output, @@ -383,14 +413,13 @@ def forward( output_attentions, ) attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + cross_attention_weights = cross_attention_outputs[1] intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) - outputs = (layer_output,) + outputs + outputs = (layer_output, self_attention_weights, cross_attention_weights) return outputs - class BertEncoder(nn.Module): def __init__(self, config): super().__init__() From 2a07ae543ac4238c130c36f8a6e770dd576f9a16 Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Thu, 2 Jul 2020 17:28:58 -0400 Subject: [PATCH 02/11] encoder - checkpoint --- src/transformers/modeling_bert.py | 71 +++++++++++++++++-------------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 1354bf9d57e1d8..8f76bca0956541 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -278,7 +278,7 @@ def forward( new_context_layer_shape[1], new_context_layer_shape[2]) - outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer, None) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer, None) return outputs @@ -370,7 +370,12 @@ class DummyModule(nn.Module): """ def __init__(self): super().__init__() - def forward(self, a, b: Optional[torch.Tensor], c: Optional[torch.Tensor], d: Optional[torch.Tensor], e: Optional[torch.Tensor]): + def forward(self, a, + b: Optional[torch.Tensor], + c: Optional[torch.Tensor], + d: Optional[torch.Tensor], + e: Optional[torch.Tensor], + f: bool): return torch.zeros([0]) class BertLayer(nn.Module): @@ 
-429,19 +434,22 @@ def __init__(self, config): def forward( self, hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - output_hidden_states=False, - ): - all_hidden_states = () - all_attentions = () - for i, layer_module in enumerate(self.layer): + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + ) -> Tuple[torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + all_hidden_states = [] + all_attentions = [] + i = 0 + for layer_module in self.layer: if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - + all_hidden_states = all_hidden_states.append(hidden_states) + layer_head_mask = None if head_mask is None else head_mask[i] + + """ if getattr(self.config, "gradient_checkpointing", False): def create_custom_forward(module): @@ -454,34 +462,35 @@ def custom_forward(*inputs): create_custom_forward(layer_module), hidden_states, attention_mask, - head_mask[i], + layer_head_mask, encoder_hidden_states, encoder_attention_mask, ) else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - head_mask[i], - encoder_hidden_states, - encoder_attention_mask, - output_attentions, - ) + """ + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) hidden_states = layer_outputs[0] if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) + attn = layer_outputs[1] + assert attn is not None + all_attentions.append(attn) + i += 1 # Add last layer if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) + all_hidden_states.append(hidden_states) - outputs = (hidden_states,) - if output_hidden_states: - outputs = outputs + (all_hidden_states,) - if output_attentions: - outputs = outputs + (all_attentions,) - return outputs # last-layer hidden state, (all hidden states), (all attentions) + return (hidden_states, + all_hidden_states if output_hidden_states else None, + all_attentions if output_attentions else None) class BertPooler(nn.Module): From d285de57d82c5b436eedba96f4f9956050fcf1db Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Mon, 20 Jul 2020 10:58:31 -0400 Subject: [PATCH 03/11] stage 1: BertModel is scriptable and tests pass --- src/transformers/modeling_bert.py | 119 ++++++++++++++++++++--------- src/transformers/modeling_utils.py | 56 +++++++++----- 2 files changed, 121 insertions(+), 54 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 8f76bca0956541..2bc6e45af057c5 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -151,6 +151,8 @@ def act_fn(name: str, input: torch.Tensor) -> torch.Tensor: BertLayerNorm = torch.nn.LayerNorm +def filter_outputs(module: nn.Module, *args, **kwargs): + return tuple(x for x in module.forward(*args, **kwargs) if x is not None) class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. 
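Note: the `filter_outputs` helper added above captures the pattern used throughout this series. Each `BertScriptable*` class returns a fixed-arity tuple with `None` placeholders so TorchScript can type the outputs, and the thin wrapper classes strip those placeholders to keep the old variable-length tuples. The sketch below only illustrates the intended usage on this branch; the import path and default `BertConfig` are assumptions, not part of the diff.

    import torch
    from transformers import BertConfig
    from transformers.modeling_bert import BertModel, BertScriptableModel

    config = BertConfig()

    # Eager wrapper: None placeholders are filtered out, so callers see the same
    # variable-length tuples as before this patch series.
    eager = BertModel(config).eval()

    # Scriptable core: forward() is fully type-annotated, so torch.jit.script
    # should be able to compile it.
    scripted = torch.jit.script(BertScriptableModel(config).eval())

    input_ids = torch.randint(0, config.vocab_size, (1, 8))
    sequence_output, pooled_output = eager(input_ids)       # Nones dropped
    seq_out, pooled, hidden, attn = scripted(input_ids)     # fixed 4-tuple; hidden/attn are None
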
@@ -167,7 +169,11 @@ def __init__(self, config): self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, input_ids: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None): + def forward(self, + input_ids: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None): if input_ids is not None: input_shape = input_ids.size() else: @@ -201,7 +207,7 @@ def forward(self, input_ids: Optional[torch.Tensor] = None, token_type_ids: Opti return embeddings -class BertSelfAttention(nn.Module): +class BertScriptableSelfAttention(nn.Module): def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -228,7 +234,7 @@ def transpose_for_scores(self, x): def forward( self, - hidden_states, + hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, @@ -281,6 +287,11 @@ def forward( outputs = (context_layer, attention_probs) if output_attentions else (context_layer, None) return outputs +class BertSelfAttention(BertScriptableSelfAttention): + def __init__(self, config): + super().__init__(config) + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) class BertSelfOutput(nn.Module): def __init__(self, config): @@ -296,10 +307,10 @@ def forward(self, hidden_states, input_tensor) -> torch.Tensor : return hidden_states -class BertAttention(nn.Module): +class BertScriptableAttention(nn.Module): def __init__(self, config): super().__init__() - self.self = BertSelfAttention(config) + self.self = BertScriptableSelfAttention(config) self.output = BertSelfOutput(config) self.pruned_heads = set() @@ -337,6 +348,11 @@ def forward( outputs = (attention_output, self_outputs[1]) # add attentions if we output them return outputs +class BertAttention(BertScriptableAttention): + def __init__(self, config): + super().__init__(config) + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) class BertIntermediate(nn.Module): def __init__(self, config): @@ -378,14 +394,14 @@ def forward(self, a, f: bool): return torch.zeros([0]) -class BertLayer(nn.Module): +class BertScriptableLayer(nn.Module): def __init__(self, config): super().__init__() - self.attention = BertAttention(config) + self.attention = BertScriptableAttention(config) self.is_decoder = config.is_decoder self.crossattention = DummyModule() # dummy module if self.is_decoder: - self.crossattention = BertAttention(config) + self.crossattention = BertScriptableAttention(config) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) @@ -425,15 +441,21 @@ def forward( outputs = (layer_output, self_attention_weights, cross_attention_weights) return outputs -class BertEncoder(nn.Module): +class BertLayer(BertScriptableLayer): + def __init__(self, config): + super().__init__(config) + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) + +class BertScriptableEncoder(nn.Module): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + 
self.layer = nn.ModuleList([BertScriptableLayer(config) for _ in range(config.num_hidden_layers)]) def forward( self, - hidden_states, + hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, @@ -446,7 +468,7 @@ def forward( i = 0 for layer_module in self.layer: if output_hidden_states: - all_hidden_states = all_hidden_states.append(hidden_states) + all_hidden_states.append(hidden_states) layer_head_mask = None if head_mask is None else head_mask[i] """ @@ -492,6 +514,11 @@ def custom_forward(*inputs): all_hidden_states if output_hidden_states else None, all_attentions if output_attentions else None) +class BertEncoder(BertScriptableEncoder): + def __init__(self, config): + super().__init__(config) + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) class BertPooler(nn.Module): def __init__(self, config): @@ -662,7 +689,7 @@ def _init_weights(self, module): "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", BERT_START_DOCSTRING, ) -class BertModel(BertPreTrainedModel): +class BertScriptableModel(BertPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well @@ -679,16 +706,25 @@ class BertModel(BertPreTrainedModel): """ - def __init__(self, config): + def __init__(self, config: BertConfig): super().__init__(config) - self.config = config - + + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.is_decoder = config.is_decoder + self.num_hidden_layers = config.num_hidden_layers + self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) + self.encoder = BertScriptableEncoder(config) self.pooler = BertPooler(config) - + # TorchScript cannot use parameters().next(), so get dtype explicitly. + self.mask_dtype = self.embeddings.word_embeddings(torch.LongTensor([0])).dtype + self.init_weights() + def get_dtype(self): + return self.mask_dtype + def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -707,16 +743,16 @@ def _prune_heads(self, heads_to_prune): @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=None, - output_hidden_states=None, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, ): r""" Return: @@ -744,9 +780,9 @@ def forward( Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states if output_hidden_states is not None else self.output_hidden_states ) if input_ids is not None and inputs_embeds is not None: @@ -758,7 +794,11 @@ def forward( else: raise ValueError("You have to specify either input_ids or inputs_embeds") - device = input_ids.device if input_ids is not None else inputs_embeds.device + if input_ids is not None: + device = input_ids.device + else: + assert inputs_embeds is not None + device = inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) @@ -767,11 +807,11 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, (input_shape[0], input_shape[1]), device) # If a 2D ou 3D attention mask is provided for the cross-attention # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: + if self.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: @@ -785,7 +825,7 @@ def forward( # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + head_mask = self.get_scriptable_head_mask(head_mask, self.num_hidden_layers) embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds @@ -802,12 +842,17 @@ def forward( sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[ - 1: - ] # add hidden_states and attentions if they are here + outputs = (sequence_output, pooled_output, encoder_outputs[1], encoder_outputs[2]) return outputs # sequence_output, pooled_output, (hidden_states), (attentions) +class BertModel(BertScriptableModel): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + def forward(self, *args, **kwargs): + return tuple(x for x in super().forward(*args, **kwargs) if x is not None) + @add_start_docstrings( """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. 
""", diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 5b000c8125d17c..d4b62f14fbd59a 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -141,8 +141,8 @@ def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: first_tuple = next(gen) return first_tuple[1].device - @property - def dtype(self) -> dtype: + # TorchScript does not support, so add non-property option + def get_dtype(self) -> dtype: """ Get torch.dtype from module, assuming that the whole module has one dtype. """ @@ -158,34 +158,40 @@ def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: gen = self._named_members(get_members_fn=find_tensor_attributes) first_tuple = next(gen) return first_tuple[1].dtype + + @property + def dtype(self): + return self.get_dtype() def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: """type: torch.Tensor -> torch.Tensor""" + encoder_extended_attention_mask: Optional[Tensor] = None if encoder_attention_mask.dim() == 3: encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] if encoder_attention_mask.dim() == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + assert encoder_extended_attention_mask is not None # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow # /transformer/transformer_layers.py#L270 # encoder_extended_attention_mask = (encoder_extended_attention_mask == # encoder_extended_attention_mask.transpose(-1, -2)) - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.get_dtype()) # fp16 compatibility - if self.dtype == torch.float16: + if self.get_dtype() == torch.float16: encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 - elif self.dtype == torch.float32: + elif self.get_dtype() == torch.float32: encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 else: raise ValueError( "{} not recognized. `dtype` should be set to either `torch.float32` or `torch.float16`".format( - self.dtype + self.get_dtype() ) ) return encoder_extended_attention_mask - def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple, device: device) -> Tensor: + def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int, int], device: device) -> Tensor: """Makes broadcastable attention mask and causal mask so that future and maked tokens are ignored. Arguments: @@ -204,7 +210,7 @@ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple # Provided a padding mask of dimensions [batch_size, seq_length] # - if the model is a decoder, apply a causal mask in addition to the padding mask # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder: + if self.is_decoder: batch_size, seq_length = input_shape seq_ids = torch.arange(seq_length, device=device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] @@ -225,7 +231,7 @@ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple # positions we want to attend and -10000.0 for masked positions. 
# Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.to(dtype=self.get_dtype()) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask @@ -241,16 +247,32 @@ def get_head_mask(self, head_mask: Tensor, num_hidden_layers: int, is_attention_ Tensor of shape shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] or list with [None] for each layer """ - if head_mask is not None: + if head_mask is None: + return [None] * num_hidden_layers + else: + return self.get_scriptable_head_mask(head_mask, num_hidden_layers, is_attention_chunked) + + def get_scriptable_head_mask(self, head_mask: Optional[Tensor], num_hidden_layers: int, is_attention_chunked: bool = False) -> Optional[Tensor]: + """ + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + attention_probs has shape bsz x n_heads x N x N + Arguments: + head_mask: torch.Tensor or None: has shape [num_heads] or [num_hidden_layers x num_heads] + num_hidden_layers: int + Returns: + Tensor of shape shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + or None + """ + if head_mask is None: + return None + else: head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) - else: - head_mask = [None] * num_hidden_layers - - return head_mask - - def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers: int): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) @@ -258,7 +280,7 @@ def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" - head_mask = head_mask.to(dtype=self.dtype) # switch to fload if need + fp16 compatibility + head_mask = head_mask.to(dtype=self.get_dtype()) # switch to fload if need + fp16 compatibility return head_mask From fe14b7efbc07eec2715e8ecfd8834abcbfccd791 Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Mon, 20 Jul 2020 14:32:11 -0400 Subject: [PATCH 04/11] all but BertForLM --- src/transformers/modeling_bert.py | 319 +++++++++++++++++------------ src/transformers/modeling_utils.py | 6 +- 2 files changed, 190 insertions(+), 135 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 2bc6e45af057c5..0a98e1177051a8 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -851,20 +851,21 @@ def __init__(self, config: BertConfig): super().__init__(config) self.config = config def forward(self, *args, **kwargs): - return tuple(x for x in super().forward(*args, **kwargs) if x is not None) + return filter_outputs(super(), *args, **kwargs) @add_start_docstrings( """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. 
""", BERT_START_DOCSTRING, ) -class BertForPreTraining(BertPreTrainedModel): +class BertScriptableForPreTraining(BertPreTrainedModel): def __init__(self, config): super().__init__(config) - self.bert = BertModel(config) + self.bert = BertScriptableModel(config) self.cls = BertPreTrainingHeads(config) - + self.vocab_size = config.vocab_size + self.init_weights() def get_output_embeddings(self): @@ -873,18 +874,18 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - **kwargs - ): + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + next_sentence_label: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] =None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + r""" labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. @@ -935,14 +936,6 @@ def forward( >>> prediction_scores, seq_relationship_scores = outputs[:2] """ - if "masked_lm_labels" in kwargs: - warnings.warn( - "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", - DeprecationWarning, - ) - labels = kwargs.pop("masked_lm_labels") - assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." 
- outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -954,21 +947,52 @@ def forward( output_hidden_states=output_hidden_states, ) - sequence_output, pooled_output = outputs[:2] + sequence_output, pooled_output, hidden_states, attentions = outputs prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - outputs = (prediction_scores, seq_relationship_score,) + outputs[ - 2: - ] # add hidden states and attention if they are here - + total_loss: Optional[torch.Tensor] = None if labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + masked_lm_loss = nn.functional.cross_entropy(prediction_scores.view(-1, self.vocab_size), labels.view(-1)) + next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - outputs = (total_loss,) + outputs + + return (total_loss, prediction_scores, seq_relationship_score, hidden_states, attentions) - return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) +class BertForPreTraining(BertScriptableForPreTraining): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + + def forward(self, + input_ids = None, + attention_mask = None, + token_type_ids = None, + position_ids = None, + head_mask = None, + inputs_embeds = None, + labels = None, + next_sentence_label = None, + output_attentions =None, + output_hidden_states = None, + **kwargs): + if "masked_lm_labels" in kwargs: + warnings.warn( + "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = kwargs.pop("masked_lm_labels") + assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." + + return tuple(x for x in super().forward(input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + inputs_embeds, + labels, + next_sentence_label, + output_attentions, + output_hidden_states) if x is not None) @add_start_docstrings( @@ -1196,11 +1220,11 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ @add_start_docstrings( """Bert Model with a `next sentence prediction (classification)` head on top. 
""", BERT_START_DOCSTRING, ) -class BertForNextSentencePrediction(BertPreTrainedModel): +class BertScriptableForNextSentencePrediction(BertPreTrainedModel): def __init__(self, config): super().__init__(config) - self.bert = BertModel(config) + self.bert = BertScriptableModel(config) self.cls = BertOnlyNSPHead(config) self.init_weights() @@ -1208,16 +1232,16 @@ def __init__(self, config): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - ): + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + next_sentence_label: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) @@ -1271,29 +1295,35 @@ def forward( ) pooled_output = outputs[1] + hidden_states, attentions = outputs[2], outputs[3] seq_relationship_score = self.cls(pooled_output) - outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + next_sentence_loss : Optional[torch.Tensor] = None if next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - outputs = (next_sentence_loss,) + outputs + next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + + return (next_sentence_loss, seq_relationship_score, hidden_states, attentions) - return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) +class BertForNextSentencePrediction(BertScriptableForNextSentencePrediction): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) @add_start_docstrings( """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", BERT_START_DOCSTRING, ) -class BertForSequenceClassification(BertPreTrainedModel): +class BertScriptableForSequenceClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.bert = BertModel(config) + self.bert = BertScriptableModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) @@ -1303,16 +1333,16 @@ def __init__(self, config): @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - ): + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. @@ -1351,35 +1381,37 @@ def forward( ) pooled_output = outputs[1] - + hidden_states, attentions = outputs[2], outputs[3] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - + loss: Optional[torch.Tensor] = None if labels is not None: if self.num_labels == 1: # We are doing regression - loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) + loss = nn.functional.mse_loss(logits.view(-1), labels.view(-1)) else: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - outputs = (loss,) + outputs - - return outputs # (loss), logits, (hidden_states), (attentions) + loss = nn.functional.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1)) + + return (loss, logits, hidden_states, attentions) +class BertForSequenceClassification(BertScriptableForSequenceClassification): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) @add_start_docstrings( """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", BERT_START_DOCSTRING, ) -class BertForMultipleChoice(BertPreTrainedModel): +class BertScriptableForMultipleChoice(BertPreTrainedModel): def __init__(self, config): super().__init__(config) - self.bert = BertModel(config) + self.bert = BertScriptableModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) @@ -1389,16 +1421,16 @@ def __init__(self, config): @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - ): + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. @@ -1425,7 +1457,11 @@ def forward( Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + if input_ids is not None: + num_choices = input_ids.shape[1] + else: + assert inputs_embeds is not None + num_choices = inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None @@ -1449,32 +1485,36 @@ def forward( ) pooled_output = outputs[1] - + hidden_states, attentions = outputs[2], outputs[3] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) - outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here - + loss: Optional[torch.Tensor] = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - outputs = (loss,) + outputs - - return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + loss = nn.functional.cross_entropy(reshaped_logits, labels) + + return (loss, reshaped_logits, hidden_states, attentions) +class BertForMultipleChoice(BertScriptableForMultipleChoice): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) @add_start_docstrings( """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", BERT_START_DOCSTRING, ) -class BertForTokenClassification(BertPreTrainedModel): +class BertScriptableForTokenClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - - self.bert = BertModel(config) + self.ignore_index = nn.CrossEntropyLoss().ignore_index + + self.bert = BertScriptableModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) @@ -1484,16 +1524,16 @@ def __init__(self, config): @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - ): + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. @@ -1530,39 +1570,44 @@ def forward( ) sequence_output = outputs[0] + hidden_states, attentions = outputs[2], outputs[3] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + loss: Optional[torch.Tensor] = None if labels is not None: - loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( - active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + active_loss, labels.view(-1), torch.tensor(self.ignore_index).type_as(labels) ) - loss = loss_fct(active_logits, active_labels) + loss = nn.functional.cross_entropy(active_logits, active_labels) else: - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - outputs = (loss,) + outputs + loss = nn.functional.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1)) - return outputs # (loss), scores, (hidden_states), (attentions) + return (loss, logits, hidden_states, attentions) +class BertForTokenClassification(BertScriptableForTokenClassification): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) @add_start_docstrings( """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", BERT_START_DOCSTRING, ) -class BertForQuestionAnswering(BertPreTrainedModel): +class BertScriptableForQuestionAnswering(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.bert = BertModel(config) + self.bert = BertScriptableModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @@ -1571,17 +1616,17 @@ def __init__(self, config): @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - ): + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + )-> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1625,13 +1670,14 @@ def forward( ) sequence_output = outputs[0] - + hidden_states, attentions = outputs[2], outputs[3] + logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) - outputs = (start_logits, end_logits,) + outputs[2:] + total_loss: Optional[torch.Tensor] = None if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: @@ -1643,10 +1689,15 @@ def forward( start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) + start_loss = nn.functional.cross_entropy(start_logits, start_positions, ignore_index=ignored_index) + end_loss = nn.functional.cross_entropy(end_logits, end_positions, ignore_index=ignored_index) total_loss = (start_loss + end_loss) / 2 - outputs = (total_loss,) + outputs + + return (total_loss, start_logits, end_logits, hidden_states, attentions) - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) +class BertForQuestionAnswering(BertScriptableForQuestionAnswering): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index d4b62f14fbd59a..6cf0d38adea7f6 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -191,6 +191,10 @@ def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: return encoder_extended_attention_mask + def get_is_decoder(self): + if hasattr(self, "is_decoder"): return self.is_decoder + else: 
return self.config.is_decoder + def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int, int], device: device) -> Tensor: """Makes broadcastable attention mask and causal mask so that future and maked tokens are ignored. @@ -210,7 +214,7 @@ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple # Provided a padding mask of dimensions [batch_size, seq_length] # - if the model is a decoder, apply a causal mask in addition to the padding mask # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.is_decoder: + if self.get_is_decoder(): batch_size, seq_length = input_shape seq_ids = torch.arange(seq_length, device=device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] From 74c1bb8c666e728d1cb0bd244fa8873a52bc2e30 Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Mon, 20 Jul 2020 15:12:22 -0400 Subject: [PATCH 05/11] move comments --- src/transformers/modeling_bert.py | 543 +++++++++++++++--------------- 1 file changed, 276 insertions(+), 267 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 0a98e1177051a8..d5a547c6d09faa 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -684,26 +684,8 @@ def _init_weights(self, module): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. """ - -@add_start_docstrings( - "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", - BERT_START_DOCSTRING, -) class BertScriptableModel(BertPreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well - as a decoder, in which case a layer of cross-attention is added between - the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, - Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the - :obj:`is_decoder` argument of the configuration set to :obj:`True`; an - :obj:`encoder_hidden_states` is expected as an input to the forward pass. - - .. _`Attention is all you need`: - https://arxiv.org/abs/1706.03762 - + """ Torch-Script-able version of BertModel, below. """ def __init__(self, config: BertConfig): @@ -739,8 +721,6 @@ def _prune_heads(self, heads_to_prune): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -754,32 +734,6 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ): - r""" - Return: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. 
- pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during pre-training. - - This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.output_hidden_states @@ -846,18 +800,61 @@ def forward( return outputs # sequence_output, pooled_output, (hidden_states), (attentions) +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) class BertModel(BertScriptableModel): + """ + + The model can behave as an encoder (with only self-attention) as well + as a decoder, in which case a layer of cross-attention is added between + the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, + Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the + :obj:`is_decoder` argument of the configuration set to :obj:`True`; an + :obj:`encoder_hidden_states` is expected as an input to the forward pass. + + .. _`Attention is all you need`: + https://arxiv.org/abs/1706.03762 + + """ def __init__(self, config: BertConfig): super().__init__(config) self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. 
The Linear + layer weights are trained from the next sentence prediction (classification) + objective during pre-training. + + This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ return filter_outputs(super(), *args, **kwargs) -@add_start_docstrings( - """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and - a `next sentence prediction (classification)` head. """, - BERT_START_DOCSTRING, -) class BertScriptableForPreTraining(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -871,7 +868,6 @@ def __init__(self, config): def get_output_embeddings(self): return self.cls.predictions.decoder - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -886,6 +882,51 @@ def forward( output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + sequence_output, pooled_output, hidden_states, attentions = outputs + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss: Optional[torch.Tensor] = None + if labels is not None and next_sentence_label is not None: + masked_lm_loss = nn.functional.cross_entropy(prediction_scores.view(-1, self.vocab_size), labels.view(-1)) + next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + return (total_loss, prediction_scores, seq_relationship_score, hidden_states, attentions) + +@add_start_docstrings( + """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and + a `next sentence prediction (classification)` head. 
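The pattern used throughout this patch is that each scriptable forward returns a fixed-arity tuple of Optional tensors, so TorchScript can infer a static return type, while the user-facing wrappers such as BertModel above simply call `filter_outputs(super(), *args, **kwargs)`. The helper itself is defined outside this hunk; a hypothetical sketch of the assumed behaviour, dropping the `None` placeholders to restore the original variable-length output tuples, would be:

from typing import Any, Tuple

def filter_outputs(scriptable: Any, *args, **kwargs) -> Tuple:
    # Sketch only: run the scriptable parent's forward and keep the entries
    # that are actually populated, mirroring the pre-refactor output tuples.
    outputs = scriptable.forward(*args, **kwargs)
    return tuple(o for o in outputs if o is not None)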
""", + BERT_START_DOCSTRING, +) +class BertForPreTraining(BertScriptableForPreTraining): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + def forward(self, + input_ids = None, + attention_mask = None, + token_type_ids = None, + position_ids = None, + head_mask = None, + inputs_embeds = None, + labels = None, + next_sentence_label = None, + output_attentions =None, + output_hidden_states = None, + **kwargs): r""" labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. @@ -936,45 +977,6 @@ def forward( >>> prediction_scores, seq_relationship_scores = outputs[:2] """ - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - sequence_output, pooled_output, hidden_states, attentions = outputs - prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - - total_loss: Optional[torch.Tensor] = None - if labels is not None and next_sentence_label is not None: - masked_lm_loss = nn.functional.cross_entropy(prediction_scores.view(-1, self.vocab_size), labels.view(-1)) - next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - - return (total_loss, prediction_scores, seq_relationship_score, hidden_states, attentions) - -class BertForPreTraining(BertScriptableForPreTraining): - def __init__(self, config: BertConfig): - super().__init__(config) - self.config = config - - def forward(self, - input_ids = None, - attention_mask = None, - token_type_ids = None, - position_ids = None, - head_mask = None, - inputs_embeds = None, - labels = None, - next_sentence_label = None, - output_attentions =None, - output_hidden_states = None, - **kwargs): if "masked_lm_labels" in kwargs: warnings.warn( "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", @@ -1217,10 +1219,10 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ return {"input_ids": input_ids, "attention_mask": attention_mask} -@add_start_docstrings( - """Bert Model with a `next sentence prediction (classification)` head on top. 
""", BERT_START_DOCSTRING, -) class BertScriptableForNextSentencePrediction(BertPreTrainedModel): + """ + Torch-Script-able version of BertForNextSentencePrediction, below + """ def __init__(self, config): super().__init__(config) @@ -1229,7 +1231,6 @@ def __init__(self, config): self.init_weights() - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1242,6 +1243,38 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + pooled_output = outputs[1] + hidden_states, attentions = outputs[2], outputs[3] + + seq_relationship_score = self.cls(pooled_output) + + next_sentence_loss : Optional[torch.Tensor] = None + if next_sentence_label is not None: + next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + + return (next_sentence_loss, seq_relationship_score, hidden_states, attentions) + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, +) +class BertForNextSentencePrediction(BertScriptableForNextSentencePrediction): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + def forward(self, *args, **kwargs): r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. 
Input should be a sequence pair (see ``input_ids`` docstring) @@ -1282,7 +1315,32 @@ def forward( >>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1])) >>> assert logits[0, 0] < logits[0, 1] # next sentence was random """ + return filter_outputs(super(), *args, **kwargs) +class BertScriptableForSequenceClassification(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertScriptableModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1296,53 +1354,32 @@ def forward( pooled_output = outputs[1] hidden_states, attentions = outputs[2], outputs[3] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) - seq_relationship_score = self.cls(pooled_output) - - next_sentence_loss : Optional[torch.Tensor] = None - if next_sentence_label is not None: - next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - - return (next_sentence_loss, seq_relationship_score, hidden_states, attentions) - - -class BertForNextSentencePrediction(BertScriptableForNextSentencePrediction): - def __init__(self, config: BertConfig): - super().__init__(config) - self.config = config - def forward(self, *args, **kwargs): - return filter_outputs(super(), *args, **kwargs) + loss: Optional[torch.Tensor] = None + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss = nn.functional.mse_loss(logits.view(-1), labels.view(-1)) + else: + loss = nn.functional.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1)) + + return (loss, logits, hidden_states, attentions) @add_start_docstrings( """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
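The loss branch a few lines above switches between regression (MSE) and classification (cross-entropy) purely on `config.num_labels`. A standalone illustration of the two paths with toy tensors (shapes and values assumed, not taken from the patch):

import torch
import torch.nn as nn

num_labels = 1                               # regression case
logits = torch.randn(4, num_labels)
targets = torch.randn(4)
mse = nn.functional.mse_loss(logits.view(-1), targets.view(-1))

num_labels = 3                               # classification case
logits = torch.randn(4, num_labels)
labels = torch.tensor([0, 2, 1, 0])
ce = nn.functional.cross_entropy(logits.view(-1, num_labels), labels.view(-1))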
""", BERT_START_DOCSTRING, ) -class BertScriptableForSequenceClassification(BertPreTrainedModel): - def __init__(self, config): +class BertForSequenceClassification(BertScriptableForSequenceClassification): + def __init__(self, config: BertConfig): super().__init__(config) - self.num_labels = config.num_labels - - self.bert = BertScriptableModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - + self.config = config + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + def forward(self, *args, **kwargs): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. @@ -1368,45 +1405,8 @@ def forward( Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - pooled_output = outputs[1] - hidden_states, attentions = outputs[2], outputs[3] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - loss: Optional[torch.Tensor] = None - if labels is not None: - if self.num_labels == 1: - # We are doing regression - loss = nn.functional.mse_loss(logits.view(-1), labels.view(-1)) - else: - loss = nn.functional.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1)) - - return (loss, logits, hidden_states, attentions) - -class BertForSequenceClassification(BertScriptableForSequenceClassification): - def __init__(self, config: BertConfig): - super().__init__(config) - self.config = config - def forward(self, *args, **kwargs): return filter_outputs(super(), *args, **kwargs) -@add_start_docstrings( - """Bert Model with a multiple choice classification head on top (a linear layer on top of - the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", - BERT_START_DOCSTRING, -) class BertScriptableForMultipleChoice(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1417,8 +1417,6 @@ def __init__(self, config): self.init_weights() - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1431,32 +1429,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for computing the multiple choice classification loss. - Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension - of the input tensors. (see `input_ids` above) - - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): - Classification loss. - classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): - `num_choices` is the second dimension of the input tensors. (see `input_ids` above). - - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ + if input_ids is not None: num_choices = input_ids.shape[1] else: @@ -1496,18 +1469,47 @@ def forward( return (loss, reshaped_logits, hidden_states, attentions) +@add_start_docstrings( + """Bert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + BERT_START_DOCSTRING, +) class BertForMultipleChoice(BertScriptableForMultipleChoice): def __init__(self, config: BertConfig): super().__init__(config) self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) + @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension + of the input tensors. 
(see `input_ids` above) + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification loss. + classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): + `num_choices` is the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ return filter_outputs(super(), *args, **kwargs) -@add_start_docstrings( - """Bert Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - BERT_START_DOCSTRING, -) class BertScriptableForTokenClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1520,8 +1522,6 @@ def __init__(self, config): self.init_weights() - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1534,30 +1534,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Labels for computing the token classification loss. - Indices should be in ``[0, ..., config.num_labels - 1]``. - - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : - Classification loss. - scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1590,18 +1567,44 @@ def forward( return (loss, logits, hidden_states, attentions) +@add_start_docstrings( + """Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + BERT_START_DOCSTRING, +) class BertForTokenClassification(BertScriptableForTokenClassification): def __init__(self, config: BertConfig): super().__init__(config) self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : + Classification loss. + scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ return filter_outputs(super(), *args, **kwargs) -@add_start_docstrings( - """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - BERT_START_DOCSTRING, -) class BertScriptableForQuestionAnswering(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1612,8 +1615,6 @@ def __init__(self, config): self.init_weights() - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1627,37 +1628,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, )-> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. - start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1695,9 +1666,47 @@ def forward( return (total_loss, start_logits, end_logits, hidden_states, attentions) +@add_start_docstrings( + """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", + BERT_START_DOCSTRING, +) class BertForQuestionAnswering(BertScriptableForQuestionAnswering): def __init__(self, config: BertConfig): super().__init__(config) self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). + end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ return filter_outputs(super(), *args, **kwargs) From f1504fa22162bbaccf7840f5b475f90bbf20d56b Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Mon, 20 Jul 2020 15:37:26 -0400 Subject: [PATCH 06/11] remove whitespace --- src/transformers/modeling_bert.py | 48 +++++++++++++++--------------- src/transformers/modeling_utils.py | 4 +-- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index d5a547c6d09faa..b10804283226ae 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -470,7 +470,7 @@ def forward( if output_hidden_states: all_hidden_states.append(hidden_states) layer_head_mask = None if head_mask is None else head_mask[i] - + """ if getattr(self.config, "gradient_checkpointing", False): @@ -690,23 +690,23 @@ class BertScriptableModel(BertPreTrainedModel): def __init__(self, config: BertConfig): super().__init__(config) - + self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.is_decoder = config.is_decoder self.num_hidden_layers = config.num_hidden_layers - + self.embeddings = BertEmbeddings(config) self.encoder = BertScriptableEncoder(config) self.pooler = BertPooler(config) # TorchScript cannot use parameters().next(), so get dtype explicitly. self.mask_dtype = self.embeddings.word_embeddings(torch.LongTensor([0])).dtype - + self.init_weights() def get_dtype(self): return self.mask_dtype - + def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -823,7 +823,7 @@ class BertModel(BertScriptableModel): def __init__(self, config: BertConfig): super().__init__(config) self.config = config - + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): @@ -862,7 +862,7 @@ def __init__(self, config): self.bert = BertScriptableModel(config) self.cls = BertPreTrainingHeads(config) self.vocab_size = config.vocab_size - + self.init_weights() def get_output_embeddings(self): @@ -881,7 +881,7 @@ def forward( output_attentions: Optional[bool] =None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -901,7 +901,7 @@ def forward( masked_lm_loss = nn.functional.cross_entropy(prediction_scores.view(-1, self.vocab_size), labels.view(-1)) next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - + return (total_loss, prediction_scores, seq_relationship_score, hidden_states, attentions) @add_start_docstrings( @@ -1262,7 +1262,7 @@ def forward( next_sentence_loss : Optional[torch.Tensor] = None if next_sentence_label is not None: next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - + return (next_sentence_loss, seq_relationship_score, hidden_states, attentions) @add_start_docstrings( @@ -1272,7 +1272,7 @@ class BertForNextSentencePrediction(BertScriptableForNextSentencePrediction): def __init__(self, config: BertConfig): super().__init__(config) self.config = config - + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward(self, *args, 
**kwargs): r""" @@ -1340,7 +1340,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1364,7 +1364,7 @@ def forward( loss = nn.functional.mse_loss(logits.view(-1), labels.view(-1)) else: loss = nn.functional.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1)) - + return (loss, logits, hidden_states, attentions) @add_start_docstrings( @@ -1376,7 +1376,7 @@ class BertForSequenceClassification(BertScriptableForSequenceClassification): def __init__(self, config: BertConfig): super().__init__(config) self.config = config - + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): @@ -1429,7 +1429,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - + if input_ids is not None: num_choices = input_ids.shape[1] else: @@ -1466,7 +1466,7 @@ def forward( loss: Optional[torch.Tensor] = None if labels is not None: loss = nn.functional.cross_entropy(reshaped_logits, labels) - + return (loss, reshaped_logits, hidden_states, attentions) @add_start_docstrings( @@ -1478,7 +1478,7 @@ class BertForMultipleChoice(BertScriptableForMultipleChoice): def __init__(self, config: BertConfig): super().__init__(config) self.config = config - + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): @@ -1515,7 +1515,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.ignore_index = nn.CrossEntropyLoss().ignore_index - + self.bert = BertScriptableModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) @@ -1534,7 +1534,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1576,7 +1576,7 @@ class BertForTokenClassification(BertScriptableForTokenClassification): def __init__(self, config: BertConfig): super().__init__(config) self.config = config - + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): @@ -1628,7 +1628,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, )-> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1642,7 +1642,7 @@ def forward( sequence_output = outputs[0] hidden_states, attentions = outputs[2], outputs[3] - + logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = 
start_logits.squeeze(-1) @@ -1663,7 +1663,7 @@ def forward( start_loss = nn.functional.cross_entropy(start_logits, start_positions, ignore_index=ignored_index) end_loss = nn.functional.cross_entropy(end_logits, end_positions, ignore_index=ignored_index) total_loss = (start_loss + end_loss) / 2 - + return (total_loss, start_logits, end_logits, hidden_states, attentions) @add_start_docstrings( @@ -1675,7 +1675,7 @@ class BertForQuestionAnswering(BertScriptableForQuestionAnswering): def __init__(self, config: BertConfig): super().__init__(config) self.config = config - + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 6cf0d38adea7f6..e6e3f7e3b4d139 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -158,7 +158,7 @@ def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: gen = self._named_members(get_members_fn=find_tensor_attributes) first_tuple = next(gen) return first_tuple[1].dtype - + @property def dtype(self): return self.get_dtype() @@ -275,7 +275,7 @@ def get_scriptable_head_mask(self, head_mask: Optional[Tensor], num_hidden_layer if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) return head_mask - + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers: int): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.dim() == 1: From 9b360d7e1f5c1c6c6b9410d4b3c7d06442074e62 Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Wed, 22 Jul 2020 11:11:41 -0400 Subject: [PATCH 07/11] add scripting test --- tests/test_modeling_bert.py | 77 +++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/tests/test_modeling_bert.py b/tests/test_modeling_bert.py index 8a689fb8434c90..648e4b2d820532 100644 --- a/tests/test_modeling_bert.py +++ b/tests/test_modeling_bert.py @@ -525,6 +525,83 @@ def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs) + # Copied from test_modeling_common.test_torchscript, but using jit.script, not jit.trace + def test_full_torchscript(self): + import copy + import tempfile + import os + + if is_torch_available(): + import torch + from transformers.modeling_bert import ( + BertScriptableModel, + BertScriptableForMultipleChoice, + BertScriptableForNextSentencePrediction, + BertScriptableForPreTraining, + BertScriptableForQuestionAnswering, + BertScriptableForSequenceClassification, + BertScriptableForTokenClassification, + ) + + config, unused_inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + configs_no_init = copy.deepcopy(config) + for key in configs_no_init.__dict__.keys(): + if "_range" in key or "_std" in key or "initializer_factor" in key: + setattr(configs_no_init, key, 1e-10) + + scriptable_model_classes = ( + BertScriptableModel, + BertScriptableForMultipleChoice, + BertScriptableForNextSentencePrediction, + BertScriptableForPreTraining, + BertScriptableForQuestionAnswering, + BertScriptableForSequenceClassification, + BertScriptableForTokenClassification, + ) + for model_class in scriptable_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + 
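                # torch.jit.script compiles the scriptable model ahead of time; a RuntimeError
                # here means the module still contains TorchScript-incompatible constructs.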
scripted = torch.jit.script(model) + except RuntimeError: + self.fail("Couldn't script module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "scripted_model.pt") + + try: + torch.jit.save(scripted, pt_file_name) + except Exception: + self.fail("Couldn't save scripted module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load scripted module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + + @slow def test_model_from_pretrained(self): for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: From 9aae7cce89c55dc25de3af87ddc1ce17a5a525da Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Wed, 22 Jul 2020 11:41:16 -0400 Subject: [PATCH 08/11] put comments back on scriptable for diff --- src/transformers/modeling_bert.py | 491 +++++++++++++++--------------- 1 file changed, 242 insertions(+), 249 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index b10804283226ae..f32230242db845 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -684,10 +684,26 @@ def _init_weights(self, module): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. """ +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) class BertScriptableModel(BertPreTrainedModel): - """ Torch-Script-able version of BertModel, below. """ + The model can behave as an encoder (with only self-attention) as well + as a decoder, in which case a layer of cross-attention is added between + the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, + Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the + :obj:`is_decoder` argument of the configuration set to :obj:`True`; an + :obj:`encoder_hidden_states` is expected as an input to the forward pass. + + .. _`Attention is all you need`: + https://arxiv.org/abs/1706.03762 + + """ def __init__(self, config: BertConfig): super().__init__(config) @@ -734,6 +750,32 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. 
The Linear + layer weights are trained from the next sentence prediction (classification) + objective during pre-training. + + This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.output_hidden_states @@ -800,26 +842,7 @@ def forward( return outputs # sequence_output, pooled_output, (hidden_states), (attentions) -@add_start_docstrings( - "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", - BERT_START_DOCSTRING, -) -class BertModel(BertScriptableModel): - """ - - The model can behave as an encoder (with only self-attention) as well - as a decoder, in which case a layer of cross-attention is added between - the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, - Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the - :obj:`is_decoder` argument of the configuration set to :obj:`True`; an - :obj:`encoder_hidden_states` is expected as an input to the forward pass. - - .. _`Attention is all you need`: - https://arxiv.org/abs/1706.03762 - - """ +class BertModel(BertScriptableModel): def __init__(self, config: BertConfig): super().__init__(config) self.config = config @@ -827,34 +850,13 @@ def __init__(self, config: BertConfig): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): - r""" - Return: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during pre-training. 
- - This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ return filter_outputs(super(), *args, **kwargs) +@add_start_docstrings( + """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and + a `next sentence prediction (classification)` head. """, + BERT_START_DOCSTRING, +) class BertScriptableForPreTraining(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -881,52 +883,6 @@ def forward( output_attentions: Optional[bool] =None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - sequence_output, pooled_output, hidden_states, attentions = outputs - prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - - total_loss: Optional[torch.Tensor] = None - if labels is not None and next_sentence_label is not None: - masked_lm_loss = nn.functional.cross_entropy(prediction_scores.view(-1, self.vocab_size), labels.view(-1)) - next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - - return (total_loss, prediction_scores, seq_relationship_score, hidden_states, attentions) - -@add_start_docstrings( - """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and - a `next sentence prediction (classification)` head. """, - BERT_START_DOCSTRING, -) -class BertForPreTraining(BertScriptableForPreTraining): - def __init__(self, config: BertConfig): - super().__init__(config) - self.config = config - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - def forward(self, - input_ids = None, - attention_mask = None, - token_type_ids = None, - position_ids = None, - head_mask = None, - inputs_embeds = None, - labels = None, - next_sentence_label = None, - output_attentions =None, - output_hidden_states = None, - **kwargs): r""" labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. 
@@ -977,6 +933,46 @@ def forward(self, >>> prediction_scores, seq_relationship_scores = outputs[:2] """ + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + sequence_output, pooled_output, hidden_states, attentions = outputs + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss: Optional[torch.Tensor] = None + if labels is not None and next_sentence_label is not None: + masked_lm_loss = nn.functional.cross_entropy(prediction_scores.view(-1, self.vocab_size), labels.view(-1)) + next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + return (total_loss, prediction_scores, seq_relationship_score, hidden_states, attentions) + +class BertForPreTraining(BertScriptableForPreTraining): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + def forward(self, + input_ids = None, + attention_mask = None, + token_type_ids = None, + position_ids = None, + head_mask = None, + inputs_embeds = None, + labels = None, + next_sentence_label = None, + output_attentions =None, + output_hidden_states = None, + **kwargs): if "masked_lm_labels" in kwargs: warnings.warn( "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", @@ -1219,6 +1215,9 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ return {"input_ids": input_ids, "attention_mask": attention_mask} +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, +) class BertScriptableForNextSentencePrediction(BertPreTrainedModel): """ Torch-Script-able version of BertForNextSentencePrediction, below @@ -1243,38 +1242,6 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - pooled_output = outputs[1] - hidden_states, attentions = outputs[2], outputs[3] - - seq_relationship_score = self.cls(pooled_output) - - next_sentence_loss : Optional[torch.Tensor] = None - if next_sentence_label is not None: - next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - - return (next_sentence_loss, seq_relationship_score, hidden_states, attentions) - -@add_start_docstrings( - """Bert Model with a `next sentence prediction (classification)` head on top. 
""", BERT_START_DOCSTRING, -) -class BertForNextSentencePrediction(BertScriptableForNextSentencePrediction): - def __init__(self, config: BertConfig): - super().__init__(config) - self.config = config - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - def forward(self, *args, **kwargs): r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) @@ -1315,8 +1282,43 @@ def forward(self, *args, **kwargs): >>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1])) >>> assert logits[0, 0] < logits[0, 1] # next sentence was random """ + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + pooled_output = outputs[1] + hidden_states, attentions = outputs[2], outputs[3] + + seq_relationship_score = self.cls(pooled_output) + + next_sentence_loss : Optional[torch.Tensor] = None + if next_sentence_label is not None: + next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + + return (next_sentence_loss, seq_relationship_score, hidden_states, attentions) + +class BertForNextSentencePrediction(BertScriptableForNextSentencePrediction): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) +@add_start_docstrings( + """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + BERT_START_DOCSTRING, +) class BertScriptableForSequenceClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1340,7 +1342,31 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the sequence classification/regression loss. + Indices should be in :obj:`[0, ..., config.num_labels - 1]`. + If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). 
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1367,11 +1393,6 @@ def forward( return (loss, logits, hidden_states, attentions) -@add_start_docstrings( - """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of - the pooled output) e.g. for GLUE tasks. """, - BERT_START_DOCSTRING, -) class BertForSequenceClassification(BertScriptableForSequenceClassification): def __init__(self, config: BertConfig): super().__init__(config) @@ -1380,33 +1401,13 @@ def __init__(self, config: BertConfig): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for computing the sequence classification/regression loss. - Indices should be in :obj:`[0, ..., config.num_labels - 1]`. - If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). - - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): - Classification (or regression if config.num_labels==1) loss. - logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ return filter_outputs(super(), *args, **kwargs) +@add_start_docstrings( + """Bert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + BERT_START_DOCSTRING, +) class BertScriptableForMultipleChoice(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1429,7 +1430,32 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification loss. + classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): + `num_choices` is the second dimension of the input tensors. (see `input_ids` above). + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ if input_ids is not None: num_choices = input_ids.shape[1] else: @@ -1469,11 +1495,6 @@ def forward( return (loss, reshaped_logits, hidden_states, attentions) -@add_start_docstrings( - """Bert Model with a multiple choice classification head on top (a linear layer on top of - the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - BERT_START_DOCSTRING, -) class BertForMultipleChoice(BertScriptableForMultipleChoice): def __init__(self, config: BertConfig): super().__init__(config) @@ -1482,34 +1503,13 @@ def __init__(self, config: BertConfig): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for computing the multiple choice classification loss. - Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension - of the input tensors. 
(see `input_ids` above) - - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): - Classification loss. - classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): - `num_choices` is the second dimension of the input tensors. (see `input_ids` above). - - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ return filter_outputs(super(), *args, **kwargs) +@add_start_docstrings( + """Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + BERT_START_DOCSTRING, +) class BertScriptableForTokenClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1534,7 +1534,29 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : + Classification loss. + scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1567,11 +1589,6 @@ def forward( return (loss, logits, hidden_states, attentions) -@add_start_docstrings( - """Bert Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - BERT_START_DOCSTRING, -) class BertForTokenClassification(BertScriptableForTokenClassification): def __init__(self, config: BertConfig): super().__init__(config) @@ -1580,31 +1597,13 @@ def __init__(self, config: BertConfig): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Labels for computing the token classification loss. - Indices should be in ``[0, ..., config.num_labels - 1]``. - - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : - Classification loss. - scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ return filter_outputs(super(), *args, **kwargs) +@add_start_docstrings( + """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, + BERT_START_DOCSTRING, +) class BertScriptableForQuestionAnswering(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1628,7 +1627,36 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, )-> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. 
+ end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). + end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1666,11 +1694,6 @@ def forward( return (total_loss, start_logits, end_logits, hidden_states, attentions) -@add_start_docstrings( - """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - BERT_START_DOCSTRING, -) class BertForQuestionAnswering(BertScriptableForQuestionAnswering): def __init__(self, config: BertConfig): super().__init__(config) @@ -1679,34 +1702,4 @@ def __init__(self, config: BertConfig): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. 
- - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. - start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ return filter_outputs(super(), *args, **kwargs) From 5e61455ab9dc1b5002f5003ae6e3e7188f461df6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 2 Sep 2020 17:30:41 +0200 Subject: [PATCH 09/11] benchmark torchscript bert --- benchmark_pytorch_scripting.py | 56 +++++++++++++++++++++++++ src/transformers/__init__.py | 1 + src/transformers/benchmark/benchmark.py | 16 ++----- 3 files changed, 60 insertions(+), 13 deletions(-) create mode 100755 benchmark_pytorch_scripting.py diff --git a/benchmark_pytorch_scripting.py b/benchmark_pytorch_scripting.py new file mode 100755 index 00000000000000..ca522d1d5aa46d --- /dev/null +++ b/benchmark_pytorch_scripting.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +from transformers.modeling_bert import BertScriptableModel +from transformers import BertConfig, BertModel, PyTorchBenchmark, PyTorchBenchmarkArguments +import torch + + +def get_model(torchscript=False, device="cpu", config=None): + if not torchscript: + return BertModel(config).to(device).eval() + model = BertScriptableModel(config).to(device).eval() + return torch.jit.script(model) + + +def get_input_ids(input_tensor_type="single_tensor", config=None, batch_size=None, sequence_length=None, device="cpu"): + if input_tensor_type == "single_tensor": + return [torch.randint(config.vocab_size, (batch_size, sequence_length), dtype=torch.long, device=device)] + elif input_tensor_type == "batched_tensors": + num_batches = batch_size // 8 + sequence_lengths = [torch.randint(1, sequence_length, (1,)).item() for i in range(num_batches)] + print("Seq Length", sequence_lengths) + return [torch.randint(config.vocab_size, (10, sequence_length), dtype=torch.long, device=device) for sequence_length in sequence_lengths] + elif input_tensor_type == "multiple_tensors": + sequence_lengths = [torch.randint(1, sequence_length, (1,)).item() for i in range(batch_size)] + print("Seq Length", sequence_lengths) + return [torch.randint(config.vocab_size, (1, sequence_length), dtype=torch.long, device=device) for sequence_length in sequence_lengths] + else: + raise 
ValueError(f"{input_tensor_type} does not exist.") + + +def get_inference_func(device, config, sequence_length, batch_size, input_tensor_type, torchscript): + model = get_model(torchscript, device, config) + input_ids = get_input_ids(input_tensor_type=input_tensor_type, config=config, batch_size=batch_size, sequence_length=sequence_length, device=device) + + def func(): + for inputs in input_ids: + result = model(inputs) + return result + + return func + + +def run_benchmark(batch_sizes, sequence_lengths, input_tensor_type="multiple_tensors", torchscript=True): + config = BertConfig.from_pretrained("bert-base-uncased") + args = PyTorchBenchmarkArguments(models=[f"Type: {input_tensor_type} - Script: {torchscript}"], no_memory=True, sequence_lengths=sequence_lengths, batch_sizes=batch_sizes, no_multi_process=True) + device = args.device + benchmark = PyTorchBenchmark(args, configs=[config]) + + def _prepare_inference_func(model_name, batch_size, sequence_length): + return get_inference_func(device=device, config=config, sequence_length=sequence_length, batch_size=batch_size, input_tensor_type=input_tensor_type, torchscript=torchscript) + + benchmark._prepare_inference_func = _prepare_inference_func + benchmark.run() + + +run_benchmark([10], [10]) +run_benchmark([10], [10], torchscript=False) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e7aaed67189235..aa2612b2e6db00 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -211,6 +211,7 @@ from .modeling_bert import ( BertPreTrainedModel, + BertScriptableModel, BertModel, BertForPreTraining, BertForMaskedLM, diff --git a/src/transformers/benchmark/benchmark.py b/src/transformers/benchmark/benchmark.py index 37d2e26aaf1b41..2ba259e152cec8 100644 --- a/src/transformers/benchmark/benchmark.py +++ b/src/transformers/benchmark/benchmark.py @@ -86,19 +86,9 @@ def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_len config = self.config_dict[model_name] if self.args.torchscript: + from transformers.modeling_bert import BertScriptableModel config.torchscript = True - - has_model_class_in_config = hasattr(config, "architecture") and len(config.architectures) > 1 - if not self.args.only_pretrain_model and has_model_class_in_config: - try: - model_class = config.architectures[0] - transformers_module = __import__("transformers", fromlist=[model_class]) - model_cls = getattr(transformers_module, model_class) - model = model_cls(config) - except ImportError: - raise ImportError( - f"{model_class} does not exist. If you just want to test the pretrained model, you might want to set `--only_pretrain_model` or `args.only_pretrain_model=True`." 
- ) + model = BertScriptableModel(config) else: model = MODEL_MAPPING[config.__class__](config) @@ -118,7 +108,7 @@ def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_len if self.args.torchscript: with torch.no_grad(): - inference_model = torch.jit.trace(model, input_ids) + inference_model = torch.jit.script(model) else: inference_model = model From f3b939cc19d3ccda33858f66fe981cc95b7e500b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 2 Sep 2020 17:26:40 +0000 Subject: [PATCH 10/11] update --- benchmark_pytorch_scripting.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/benchmark_pytorch_scripting.py b/benchmark_pytorch_scripting.py index ca522d1d5aa46d..c4bf3c66f3d718 100755 --- a/benchmark_pytorch_scripting.py +++ b/benchmark_pytorch_scripting.py @@ -11,17 +11,15 @@ def get_model(torchscript=False, device="cpu", config=None): return torch.jit.script(model) -def get_input_ids(input_tensor_type="single_tensor", config=None, batch_size=None, sequence_length=None, device="cpu"): - if input_tensor_type == "single_tensor": +def get_input_ids(input_tensor_type="single", config=None, batch_size=None, sequence_length=None, device="cpu"): + if input_tensor_type == "single": return [torch.randint(config.vocab_size, (batch_size, sequence_length), dtype=torch.long, device=device)] - elif input_tensor_type == "batched_tensors": + elif input_tensor_type == "batched": num_batches = batch_size // 8 sequence_lengths = [torch.randint(1, sequence_length, (1,)).item() for i in range(num_batches)] - print("Seq Length", sequence_lengths) return [torch.randint(config.vocab_size, (10, sequence_length), dtype=torch.long, device=device) for sequence_length in sequence_lengths] - elif input_tensor_type == "multiple_tensors": + elif input_tensor_type == "multiple": sequence_lengths = [torch.randint(1, sequence_length, (1,)).item() for i in range(batch_size)] - print("Seq Length", sequence_lengths) return [torch.randint(config.vocab_size, (1, sequence_length), dtype=torch.long, device=device) for sequence_length in sequence_lengths] else: raise ValueError(f"{input_tensor_type} does not exist.") @@ -31,17 +29,17 @@ def get_inference_func(device, config, sequence_length, batch_size, input_tensor model = get_model(torchscript, device, config) input_ids = get_input_ids(input_tensor_type=input_tensor_type, config=config, batch_size=batch_size, sequence_length=sequence_length, device=device) + @torch.no_grad() def func(): - for inputs in input_ids: - result = model(inputs) + for i in input_ids: + result = model(i) return result - return func -def run_benchmark(batch_sizes, sequence_lengths, input_tensor_type="multiple_tensors", torchscript=True): +def run_benchmark(batch_sizes, sequence_lengths, input_tensor_type="multiple", torchscript=True): config = BertConfig.from_pretrained("bert-base-uncased") - args = PyTorchBenchmarkArguments(models=[f"Type: {input_tensor_type} - Script: {torchscript}"], no_memory=True, sequence_lengths=sequence_lengths, batch_sizes=batch_sizes, no_multi_process=True) + args = PyTorchBenchmarkArguments(models=[f"Type: {input_tensor_type} - Script: {torchscript}"], no_memory=True, sequence_lengths=sequence_lengths, batch_sizes=batch_sizes, no_multi_process=True, repeat=1, torchscript=True, no_env_print=True) device = args.device benchmark = PyTorchBenchmark(args, configs=[config]) @@ -52,5 +50,12 @@ def _prepare_inference_func(model_name, batch_size, sequence_length): benchmark.run() -run_benchmark([10], 
[10]) -run_benchmark([10], [10], torchscript=False) +torch.manual_seed(0) +run_benchmark([500, 2500], [128, 512]) +torch.manual_seed(0) +run_benchmark([500, 2500], [128, 512], torchscript=False) + +torch.manual_seed(0) +run_benchmark([512, 4096], [128, 512], input_tensor_type="batched") +torch.manual_seed(0) +run_benchmark([512, 4096], [128, 512], torchscript=False, input_tensor_type="batched") From 132600b9d89d81fae1c8238a3babda12e38a5198 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 3 Sep 2020 17:02:18 +0000 Subject: [PATCH 11/11] correct benchmark --- benchmark_pytorch_scripting.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/benchmark_pytorch_scripting.py b/benchmark_pytorch_scripting.py index c4bf3c66f3d718..cb916edfb16478 100755 --- a/benchmark_pytorch_scripting.py +++ b/benchmark_pytorch_scripting.py @@ -4,9 +4,12 @@ import torch -def get_model(torchscript=False, device="cpu", config=None): +def get_model(torchscript=False, device="cpu", config=None, max_seq_length=None): + input_ids = torch.ones((1, max_seq_length), device=device, dtype=torch.long) if not torchscript: - return BertModel(config).to(device).eval() + model = BertModel(config).to(device).eval() + traced_model = torch.jit.trace(model, input_ids) + return traced_model model = BertScriptableModel(config).to(device).eval() return torch.jit.script(model) @@ -26,7 +29,7 @@ def get_input_ids(input_tensor_type="single", config=None, batch_size=None, sequ def get_inference_func(device, config, sequence_length, batch_size, input_tensor_type, torchscript): - model = get_model(torchscript, device, config) + model = get_model(torchscript, device, config, sequence_length) input_ids = get_input_ids(input_tensor_type=input_tensor_type, config=config, batch_size=batch_size, sequence_length=sequence_length, device=device) @torch.no_grad()