From 5a05758f5bd316a832387c4b6969f443bc56bf7d Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Thu, 2 Jul 2020 16:18:40 -0400 Subject: [PATCH 01/11] encoder --- src/transformers/modeling_bert.py | 101 +++++++++++++++++++----------- 1 file changed, 65 insertions(+), 36 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 23d25cfa09847a..1354bf9d57e1d8 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -19,6 +19,7 @@ import logging import math import os +from typing import List, Optional, Tuple import warnings import torch @@ -141,6 +142,12 @@ def mish(x): ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish} +def act_fn(name: str, input: torch.Tensor) -> torch.Tensor: + if name == "gelu": return gelu(input) + if name == "relu": return torch.nn.functional.relu(input) + if name == "swish": return swish(input) + if name == "gelu_new": return gelu_new(input) + else: raise KeyError("Unknown activation function '" + name + "'!") BertLayerNorm = torch.nn.LayerNorm @@ -160,14 +167,21 @@ def __init__(self, config): self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): + def forward(self, input_ids: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None): if input_ids is not None: input_shape = input_ids.size() else: + assert inputs_embeds is not None input_shape = inputs_embeds.size()[:-1] seq_length = input_shape[1] - device = input_ids.device if input_ids is not None else inputs_embeds.device + + if input_ids is not None: + device = input_ids.device + else: + assert inputs_embeds is not None + device = inputs_embeds.device + if position_ids is None: position_ids = torch.arange(seq_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).expand(input_shape) @@ -175,7 +189,9 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) if inputs_embeds is None: + assert input_ids is not None inputs_embeds = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) @@ -206,18 +222,19 @@ def __init__(self, config): def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(*new_x_shape) + x = x.view(new_x_shape[0], new_x_shape[1], + new_x_shape[2], new_x_shape[3]) return x.permute(0, 2, 1, 3) def forward( self, hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - ): + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys @@ -243,7 +260,7 @@ def forward( attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. 
- attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. @@ -257,9 +274,11 @@ def forward( context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(*new_context_layer_shape) + context_layer = context_layer.view(new_context_layer_shape[0], + new_context_layer_shape[1], + new_context_layer_shape[2]) - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer, None) return outputs @@ -270,7 +289,7 @@ def __init__(self, config): self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, hidden_states, input_tensor): + def forward(self, hidden_states, input_tensor) -> torch.Tensor : hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) @@ -305,17 +324,17 @@ def prune_heads(self, heads): def forward( self, hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - ): + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: self_outputs = self.self( hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions, ) attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + outputs = (attention_output, self_outputs[1]) # add attentions if we output them return outputs @@ -323,14 +342,11 @@ class BertIntermediate(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act + self.act_fn_name = config.hidden_act def forward(self, hidden_states): hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = act_fn(self.act_fn_name, hidden_states) return hidden_states @@ -347,12 +363,22 @@ def forward(self, hidden_states, input_tensor): hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states +class DummyModule(nn.Module): + """ + Dummy Module as a placeholder for cross attention Module. + Used to appease TorchScript in BertLayers which are not decoders. 
+ """ + def __init__(self): + super().__init__() + def forward(self, a, b: Optional[torch.Tensor], c: Optional[torch.Tensor], d: Optional[torch.Tensor], e: Optional[torch.Tensor]): + return torch.zeros([0]) class BertLayer(nn.Module): def __init__(self, config): super().__init__() self.attention = BertAttention(config) self.is_decoder = config.is_decoder + self.crossattention = DummyModule() # dummy module if self.is_decoder: self.crossattention = BertAttention(config) self.intermediate = BertIntermediate(config) @@ -361,18 +387,22 @@ def __init__(self, config): def forward( self, hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - ): + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Returns the output hidden states, the self-attention weights (if returned), + and the cross-attention weights (if the layer is a decoder layer, and weights are returned). + """ self_attention_outputs = self.attention( hidden_states, attention_mask, head_mask, output_attentions=output_attentions, ) attention_output = self_attention_outputs[0] - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - + self_attention_weights = self_attention_outputs[1] + cross_attention_weights : Optional[torch.Tensor] = None if self.is_decoder and encoder_hidden_states is not None: cross_attention_outputs = self.crossattention( attention_output, @@ -383,14 +413,13 @@ def forward( output_attentions, ) attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + cross_attention_weights = cross_attention_outputs[1] intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) - outputs = (layer_output,) + outputs + outputs = (layer_output, self_attention_weights, cross_attention_weights) return outputs - class BertEncoder(nn.Module): def __init__(self, config): super().__init__() From 2a07ae543ac4238c130c36f8a6e770dd576f9a16 Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Thu, 2 Jul 2020 17:28:58 -0400 Subject: [PATCH 02/11] encoder - checkpoint --- src/transformers/modeling_bert.py | 71 +++++++++++++++++-------------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 1354bf9d57e1d8..8f76bca0956541 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -278,7 +278,7 @@ def forward( new_context_layer_shape[1], new_context_layer_shape[2]) - outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer, None) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer, None) return outputs @@ -370,7 +370,12 @@ class DummyModule(nn.Module): """ def __init__(self): super().__init__() - def forward(self, a, b: Optional[torch.Tensor], c: Optional[torch.Tensor], d: Optional[torch.Tensor], e: Optional[torch.Tensor]): + def forward(self, a, + b: Optional[torch.Tensor], + c: Optional[torch.Tensor], + d: Optional[torch.Tensor], + e: Optional[torch.Tensor], + f: bool): return torch.zeros([0]) class BertLayer(nn.Module): @@ 
-429,19 +434,22 @@ def __init__(self, config): def forward( self, hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - output_hidden_states=False, - ): - all_hidden_states = () - all_attentions = () - for i, layer_module in enumerate(self.layer): + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + ) -> Tuple[torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + all_hidden_states = [] + all_attentions = [] + i = 0 + for layer_module in self.layer: if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - + all_hidden_states = all_hidden_states.append(hidden_states) + layer_head_mask = None if head_mask is None else head_mask[i] + + """ if getattr(self.config, "gradient_checkpointing", False): def create_custom_forward(module): @@ -454,34 +462,35 @@ def custom_forward(*inputs): create_custom_forward(layer_module), hidden_states, attention_mask, - head_mask[i], + layer_head_mask, encoder_hidden_states, encoder_attention_mask, ) else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - head_mask[i], - encoder_hidden_states, - encoder_attention_mask, - output_attentions, - ) + """ + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) hidden_states = layer_outputs[0] if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) + attn = layer_outputs[1] + assert attn is not None + all_attentions.append(attn) + i += 1 # Add last layer if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) + all_hidden_states.append(hidden_states) - outputs = (hidden_states,) - if output_hidden_states: - outputs = outputs + (all_hidden_states,) - if output_attentions: - outputs = outputs + (all_attentions,) - return outputs # last-layer hidden state, (all hidden states), (all attentions) + return (hidden_states, + all_hidden_states if output_hidden_states else None, + all_attentions if output_attentions else None) class BertPooler(nn.Module): From d285de57d82c5b436eedba96f4f9956050fcf1db Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Mon, 20 Jul 2020 10:58:31 -0400 Subject: [PATCH 03/11] stage 1: BertModel is scriptable and tests pass --- src/transformers/modeling_bert.py | 119 ++++++++++++++++++++--------- src/transformers/modeling_utils.py | 56 +++++++++----- 2 files changed, 121 insertions(+), 54 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 8f76bca0956541..2bc6e45af057c5 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -151,6 +151,8 @@ def act_fn(name: str, input: torch.Tensor) -> torch.Tensor: BertLayerNorm = torch.nn.LayerNorm +def filter_outputs(module: nn.Module, *args, **kwargs): + return tuple(x for x in module.forward(*args, **kwargs) if x is not None) class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. 
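Note: the `filter_outputs` helper added above captures the pattern used throughout this series. Each `BertScriptable*` class returns a fixed-arity tuple with `None` placeholders so TorchScript can type the outputs, and the thin wrapper classes strip those placeholders to keep the old variable-length tuples. The sketch below only illustrates the intended usage on this branch; the import path and default `BertConfig` are assumptions, not part of the diff.

    import torch
    from transformers import BertConfig
    from transformers.modeling_bert import BertModel, BertScriptableModel

    config = BertConfig()

    # Eager wrapper: None placeholders are filtered out, so callers see the same
    # variable-length tuples as before this patch series.
    eager = BertModel(config).eval()

    # Scriptable core: forward() is fully type-annotated, so torch.jit.script
    # should be able to compile it.
    scripted = torch.jit.script(BertScriptableModel(config).eval())

    input_ids = torch.randint(0, config.vocab_size, (1, 8))
    sequence_output, pooled_output = eager(input_ids)       # Nones dropped
    seq_out, pooled, hidden, attn = scripted(input_ids)     # fixed 4-tuple; hidden/attn are None
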
@@ -167,7 +169,11 @@ def __init__(self, config): self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, input_ids: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None): + def forward(self, + input_ids: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None): if input_ids is not None: input_shape = input_ids.size() else: @@ -201,7 +207,7 @@ def forward(self, input_ids: Optional[torch.Tensor] = None, token_type_ids: Opti return embeddings -class BertSelfAttention(nn.Module): +class BertScriptableSelfAttention(nn.Module): def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -228,7 +234,7 @@ def transpose_for_scores(self, x): def forward( self, - hidden_states, + hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, @@ -281,6 +287,11 @@ def forward( outputs = (context_layer, attention_probs) if output_attentions else (context_layer, None) return outputs +class BertSelfAttention(BertScriptableSelfAttention): + def __init__(self, config): + super().__init__(config) + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) class BertSelfOutput(nn.Module): def __init__(self, config): @@ -296,10 +307,10 @@ def forward(self, hidden_states, input_tensor) -> torch.Tensor : return hidden_states -class BertAttention(nn.Module): +class BertScriptableAttention(nn.Module): def __init__(self, config): super().__init__() - self.self = BertSelfAttention(config) + self.self = BertScriptableSelfAttention(config) self.output = BertSelfOutput(config) self.pruned_heads = set() @@ -337,6 +348,11 @@ def forward( outputs = (attention_output, self_outputs[1]) # add attentions if we output them return outputs +class BertAttention(BertScriptableAttention): + def __init__(self, config): + super().__init__(config) + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) class BertIntermediate(nn.Module): def __init__(self, config): @@ -378,14 +394,14 @@ def forward(self, a, f: bool): return torch.zeros([0]) -class BertLayer(nn.Module): +class BertScriptableLayer(nn.Module): def __init__(self, config): super().__init__() - self.attention = BertAttention(config) + self.attention = BertScriptableAttention(config) self.is_decoder = config.is_decoder self.crossattention = DummyModule() # dummy module if self.is_decoder: - self.crossattention = BertAttention(config) + self.crossattention = BertScriptableAttention(config) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) @@ -425,15 +441,21 @@ def forward( outputs = (layer_output, self_attention_weights, cross_attention_weights) return outputs -class BertEncoder(nn.Module): +class BertLayer(BertScriptableLayer): + def __init__(self, config): + super().__init__(config) + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) + +class BertScriptableEncoder(nn.Module): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + 
self.layer = nn.ModuleList([BertScriptableLayer(config) for _ in range(config.num_hidden_layers)]) def forward( self, - hidden_states, + hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, @@ -446,7 +468,7 @@ def forward( i = 0 for layer_module in self.layer: if output_hidden_states: - all_hidden_states = all_hidden_states.append(hidden_states) + all_hidden_states.append(hidden_states) layer_head_mask = None if head_mask is None else head_mask[i] """ @@ -492,6 +514,11 @@ def custom_forward(*inputs): all_hidden_states if output_hidden_states else None, all_attentions if output_attentions else None) +class BertEncoder(BertScriptableEncoder): + def __init__(self, config): + super().__init__(config) + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) class BertPooler(nn.Module): def __init__(self, config): @@ -662,7 +689,7 @@ def _init_weights(self, module): "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", BERT_START_DOCSTRING, ) -class BertModel(BertPreTrainedModel): +class BertScriptableModel(BertPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well @@ -679,16 +706,25 @@ class BertModel(BertPreTrainedModel): """ - def __init__(self, config): + def __init__(self, config: BertConfig): super().__init__(config) - self.config = config - + + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.is_decoder = config.is_decoder + self.num_hidden_layers = config.num_hidden_layers + self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) + self.encoder = BertScriptableEncoder(config) self.pooler = BertPooler(config) - + # TorchScript cannot use parameters().next(), so get dtype explicitly. + self.mask_dtype = self.embeddings.word_embeddings(torch.LongTensor([0])).dtype + self.init_weights() + def get_dtype(self): + return self.mask_dtype + def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -707,16 +743,16 @@ def _prune_heads(self, heads_to_prune): @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=None, - output_hidden_states=None, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, ): r""" Return: @@ -744,9 +780,9 @@ def forward( Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states if output_hidden_states is not None else self.output_hidden_states ) if input_ids is not None and inputs_embeds is not None: @@ -758,7 +794,11 @@ def forward( else: raise ValueError("You have to specify either input_ids or inputs_embeds") - device = input_ids.device if input_ids is not None else inputs_embeds.device + if input_ids is not None: + device = input_ids.device + else: + assert inputs_embeds is not None + device = inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) @@ -767,11 +807,11 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, (input_shape[0], input_shape[1]), device) # If a 2D ou 3D attention mask is provided for the cross-attention # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: + if self.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: @@ -785,7 +825,7 @@ def forward( # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + head_mask = self.get_scriptable_head_mask(head_mask, self.num_hidden_layers) embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds @@ -802,12 +842,17 @@ def forward( sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[ - 1: - ] # add hidden_states and attentions if they are here + outputs = (sequence_output, pooled_output, encoder_outputs[1], encoder_outputs[2]) return outputs # sequence_output, pooled_output, (hidden_states), (attentions) +class BertModel(BertScriptableModel): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + def forward(self, *args, **kwargs): + return tuple(x for x in super().forward(*args, **kwargs) if x is not None) + @add_start_docstrings( """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. 
""", diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 5b000c8125d17c..d4b62f14fbd59a 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -141,8 +141,8 @@ def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: first_tuple = next(gen) return first_tuple[1].device - @property - def dtype(self) -> dtype: + # TorchScript does not support, so add non-property option + def get_dtype(self) -> dtype: """ Get torch.dtype from module, assuming that the whole module has one dtype. """ @@ -158,34 +158,40 @@ def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: gen = self._named_members(get_members_fn=find_tensor_attributes) first_tuple = next(gen) return first_tuple[1].dtype + + @property + def dtype(self): + return self.get_dtype() def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: """type: torch.Tensor -> torch.Tensor""" + encoder_extended_attention_mask: Optional[Tensor] = None if encoder_attention_mask.dim() == 3: encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] if encoder_attention_mask.dim() == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + assert encoder_extended_attention_mask is not None # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow # /transformer/transformer_layers.py#L270 # encoder_extended_attention_mask = (encoder_extended_attention_mask == # encoder_extended_attention_mask.transpose(-1, -2)) - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.get_dtype()) # fp16 compatibility - if self.dtype == torch.float16: + if self.get_dtype() == torch.float16: encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 - elif self.dtype == torch.float32: + elif self.get_dtype() == torch.float32: encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 else: raise ValueError( "{} not recognized. `dtype` should be set to either `torch.float32` or `torch.float16`".format( - self.dtype + self.get_dtype() ) ) return encoder_extended_attention_mask - def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple, device: device) -> Tensor: + def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int, int], device: device) -> Tensor: """Makes broadcastable attention mask and causal mask so that future and maked tokens are ignored. Arguments: @@ -204,7 +210,7 @@ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple # Provided a padding mask of dimensions [batch_size, seq_length] # - if the model is a decoder, apply a causal mask in addition to the padding mask # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder: + if self.is_decoder: batch_size, seq_length = input_shape seq_ids = torch.arange(seq_length, device=device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] @@ -225,7 +231,7 @@ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple # positions we want to attend and -10000.0 for masked positions. 
# Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.to(dtype=self.get_dtype()) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask @@ -241,16 +247,32 @@ def get_head_mask(self, head_mask: Tensor, num_hidden_layers: int, is_attention_ Tensor of shape shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] or list with [None] for each layer """ - if head_mask is not None: + if head_mask is None: + return [None] * num_hidden_layers + else: + return self.get_scriptable_head_mask(head_mask, num_hidden_layers, is_attention_chunked) + + def get_scriptable_head_mask(self, head_mask: Optional[Tensor], num_hidden_layers: int, is_attention_chunked: bool = False) -> Optional[Tensor]: + """ + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + attention_probs has shape bsz x n_heads x N x N + Arguments: + head_mask: torch.Tensor or None: has shape [num_heads] or [num_hidden_layers x num_heads] + num_hidden_layers: int + Returns: + Tensor of shape shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + or None + """ + if head_mask is None: + return None + else: head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) - else: - head_mask = [None] * num_hidden_layers - - return head_mask - - def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers: int): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) @@ -258,7 +280,7 @@ def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" - head_mask = head_mask.to(dtype=self.dtype) # switch to fload if need + fp16 compatibility + head_mask = head_mask.to(dtype=self.get_dtype()) # switch to fload if need + fp16 compatibility return head_mask From fe14b7efbc07eec2715e8ecfd8834abcbfccd791 Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Mon, 20 Jul 2020 14:32:11 -0400 Subject: [PATCH 04/11] all but BertForLM --- src/transformers/modeling_bert.py | 319 +++++++++++++++++------------ src/transformers/modeling_utils.py | 6 +- 2 files changed, 190 insertions(+), 135 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 2bc6e45af057c5..0a98e1177051a8 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -851,20 +851,21 @@ def __init__(self, config: BertConfig): super().__init__(config) self.config = config def forward(self, *args, **kwargs): - return tuple(x for x in super().forward(*args, **kwargs) if x is not None) + return filter_outputs(super(), *args, **kwargs) @add_start_docstrings( """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. 
""", BERT_START_DOCSTRING, ) -class BertForPreTraining(BertPreTrainedModel): +class BertScriptableForPreTraining(BertPreTrainedModel): def __init__(self, config): super().__init__(config) - self.bert = BertModel(config) + self.bert = BertScriptableModel(config) self.cls = BertPreTrainingHeads(config) - + self.vocab_size = config.vocab_size + self.init_weights() def get_output_embeddings(self): @@ -873,18 +874,18 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - **kwargs - ): + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + next_sentence_label: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] =None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + r""" labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. @@ -935,14 +936,6 @@ def forward( >>> prediction_scores, seq_relationship_scores = outputs[:2] """ - if "masked_lm_labels" in kwargs: - warnings.warn( - "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", - DeprecationWarning, - ) - labels = kwargs.pop("masked_lm_labels") - assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." 
- outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -954,21 +947,52 @@ def forward( output_hidden_states=output_hidden_states, ) - sequence_output, pooled_output = outputs[:2] + sequence_output, pooled_output, hidden_states, attentions = outputs prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - outputs = (prediction_scores, seq_relationship_score,) + outputs[ - 2: - ] # add hidden states and attention if they are here - + total_loss: Optional[torch.Tensor] = None if labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + masked_lm_loss = nn.functional.cross_entropy(prediction_scores.view(-1, self.vocab_size), labels.view(-1)) + next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - outputs = (total_loss,) + outputs + + return (total_loss, prediction_scores, seq_relationship_score, hidden_states, attentions) - return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) +class BertForPreTraining(BertScriptableForPreTraining): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + + def forward(self, + input_ids = None, + attention_mask = None, + token_type_ids = None, + position_ids = None, + head_mask = None, + inputs_embeds = None, + labels = None, + next_sentence_label = None, + output_attentions =None, + output_hidden_states = None, + **kwargs): + if "masked_lm_labels" in kwargs: + warnings.warn( + "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = kwargs.pop("masked_lm_labels") + assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." + + return tuple(x for x in super().forward(input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + inputs_embeds, + labels, + next_sentence_label, + output_attentions, + output_hidden_states) if x is not None) @add_start_docstrings( @@ -1196,11 +1220,11 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ @add_start_docstrings( """Bert Model with a `next sentence prediction (classification)` head on top. 
""", BERT_START_DOCSTRING, ) -class BertForNextSentencePrediction(BertPreTrainedModel): +class BertScriptableForNextSentencePrediction(BertPreTrainedModel): def __init__(self, config): super().__init__(config) - self.bert = BertModel(config) + self.bert = BertScriptableModel(config) self.cls = BertOnlyNSPHead(config) self.init_weights() @@ -1208,16 +1232,16 @@ def __init__(self, config): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - ): + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + next_sentence_label: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) @@ -1271,29 +1295,35 @@ def forward( ) pooled_output = outputs[1] + hidden_states, attentions = outputs[2], outputs[3] seq_relationship_score = self.cls(pooled_output) - outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + next_sentence_loss : Optional[torch.Tensor] = None if next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - outputs = (next_sentence_loss,) + outputs + next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + + return (next_sentence_loss, seq_relationship_score, hidden_states, attentions) - return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) +class BertForNextSentencePrediction(BertScriptableForNextSentencePrediction): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) @add_start_docstrings( """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", BERT_START_DOCSTRING, ) -class BertForSequenceClassification(BertPreTrainedModel): +class BertScriptableForSequenceClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.bert = BertModel(config) + self.bert = BertScriptableModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) @@ -1303,16 +1333,16 @@ def __init__(self, config): @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - ): + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. @@ -1351,35 +1381,37 @@ def forward( ) pooled_output = outputs[1] - + hidden_states, attentions = outputs[2], outputs[3] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - + loss: Optional[torch.Tensor] = None if labels is not None: if self.num_labels == 1: # We are doing regression - loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) + loss = nn.functional.mse_loss(logits.view(-1), labels.view(-1)) else: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - outputs = (loss,) + outputs - - return outputs # (loss), logits, (hidden_states), (attentions) + loss = nn.functional.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1)) + + return (loss, logits, hidden_states, attentions) +class BertForSequenceClassification(BertScriptableForSequenceClassification): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) @add_start_docstrings( """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", BERT_START_DOCSTRING, ) -class BertForMultipleChoice(BertPreTrainedModel): +class BertScriptableForMultipleChoice(BertPreTrainedModel): def __init__(self, config): super().__init__(config) - self.bert = BertModel(config) + self.bert = BertScriptableModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) @@ -1389,16 +1421,16 @@ def __init__(self, config): @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - ): + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. @@ -1425,7 +1457,11 @@ def forward( Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + if input_ids is not None: + num_choices = input_ids.shape[1] + else: + assert inputs_embeds is not None + num_choices = inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None @@ -1449,32 +1485,36 @@ def forward( ) pooled_output = outputs[1] - + hidden_states, attentions = outputs[2], outputs[3] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) - outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here - + loss: Optional[torch.Tensor] = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - outputs = (loss,) + outputs - - return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + loss = nn.functional.cross_entropy(reshaped_logits, labels) + + return (loss, reshaped_logits, hidden_states, attentions) +class BertForMultipleChoice(BertScriptableForMultipleChoice): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) @add_start_docstrings( """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", BERT_START_DOCSTRING, ) -class BertForTokenClassification(BertPreTrainedModel): +class BertScriptableForTokenClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - - self.bert = BertModel(config) + self.ignore_index = nn.CrossEntropyLoss().ignore_index + + self.bert = BertScriptableModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) @@ -1484,16 +1524,16 @@ def __init__(self, config): @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - ): + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. @@ -1530,39 +1570,44 @@ def forward( ) sequence_output = outputs[0] + hidden_states, attentions = outputs[2], outputs[3] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + loss: Optional[torch.Tensor] = None if labels is not None: - loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( - active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + active_loss, labels.view(-1), torch.tensor(self.ignore_index).type_as(labels) ) - loss = loss_fct(active_logits, active_labels) + loss = nn.functional.cross_entropy(active_logits, active_labels) else: - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - outputs = (loss,) + outputs + loss = nn.functional.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1)) - return outputs # (loss), scores, (hidden_states), (attentions) + return (loss, logits, hidden_states, attentions) +class BertForTokenClassification(BertScriptableForTokenClassification): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) @add_start_docstrings( """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", BERT_START_DOCSTRING, ) -class BertForQuestionAnswering(BertPreTrainedModel): +class BertScriptableForQuestionAnswering(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.bert = BertModel(config) + self.bert = BertScriptableModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @@ -1571,17 +1616,17 @@ def __init__(self, config): @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - ): + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + )-> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1625,13 +1670,14 @@ def forward( ) sequence_output = outputs[0] - + hidden_states, attentions = outputs[2], outputs[3] + logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) - outputs = (start_logits, end_logits,) + outputs[2:] + total_loss: Optional[torch.Tensor] = None if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: @@ -1643,10 +1689,15 @@ def forward( start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) + start_loss = nn.functional.cross_entropy(start_logits, start_positions, ignore_index=ignored_index) + end_loss = nn.functional.cross_entropy(end_logits, end_positions, ignore_index=ignored_index) total_loss = (start_loss + end_loss) / 2 - outputs = (total_loss,) + outputs + + return (total_loss, start_logits, end_logits, hidden_states, attentions) - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) +class BertForQuestionAnswering(BertScriptableForQuestionAnswering): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index d4b62f14fbd59a..6cf0d38adea7f6 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -191,6 +191,10 @@ def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: return encoder_extended_attention_mask + def get_is_decoder(self): + if hasattr(self, "is_decoder"): return self.is_decoder + else: 
return self.config.is_decoder + def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int, int], device: device) -> Tensor: """Makes broadcastable attention mask and causal mask so that future and maked tokens are ignored. @@ -210,7 +214,7 @@ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple # Provided a padding mask of dimensions [batch_size, seq_length] # - if the model is a decoder, apply a causal mask in addition to the padding mask # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.is_decoder: + if self.get_is_decoder(): batch_size, seq_length = input_shape seq_ids = torch.arange(seq_length, device=device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] From 74c1bb8c666e728d1cb0bd244fa8873a52bc2e30 Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Mon, 20 Jul 2020 15:12:22 -0400 Subject: [PATCH 05/11] move comments --- src/transformers/modeling_bert.py | 543 +++++++++++++++--------------- 1 file changed, 276 insertions(+), 267 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 0a98e1177051a8..d5a547c6d09faa 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -684,26 +684,8 @@ def _init_weights(self, module): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. """ - -@add_start_docstrings( - "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", - BERT_START_DOCSTRING, -) class BertScriptableModel(BertPreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well - as a decoder, in which case a layer of cross-attention is added between - the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, - Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the - :obj:`is_decoder` argument of the configuration set to :obj:`True`; an - :obj:`encoder_hidden_states` is expected as an input to the forward pass. - - .. _`Attention is all you need`: - https://arxiv.org/abs/1706.03762 - + """ Torch-Script-able version of BertModel, below. """ def __init__(self, config: BertConfig): @@ -739,8 +721,6 @@ def _prune_heads(self, heads_to_prune): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -754,32 +734,6 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ): - r""" - Return: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. 
- pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during pre-training. - - This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.output_hidden_states @@ -846,18 +800,61 @@ def forward( return outputs # sequence_output, pooled_output, (hidden_states), (attentions) +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) class BertModel(BertScriptableModel): + """ + + The model can behave as an encoder (with only self-attention) as well + as a decoder, in which case a layer of cross-attention is added between + the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, + Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the + :obj:`is_decoder` argument of the configuration set to :obj:`True`; an + :obj:`encoder_hidden_states` is expected as an input to the forward pass. + + .. _`Attention is all you need`: + https://arxiv.org/abs/1706.03762 + + """ def __init__(self, config: BertConfig): super().__init__(config) self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. 
The Linear + layer weights are trained from the next sentence prediction (classification) + objective during pre-training. + + This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ return filter_outputs(super(), *args, **kwargs) -@add_start_docstrings( - """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and - a `next sentence prediction (classification)` head. """, - BERT_START_DOCSTRING, -) class BertScriptableForPreTraining(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -871,7 +868,6 @@ def __init__(self, config): def get_output_embeddings(self): return self.cls.predictions.decoder - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -886,6 +882,51 @@ def forward( output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + sequence_output, pooled_output, hidden_states, attentions = outputs + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss: Optional[torch.Tensor] = None + if labels is not None and next_sentence_label is not None: + masked_lm_loss = nn.functional.cross_entropy(prediction_scores.view(-1, self.vocab_size), labels.view(-1)) + next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + return (total_loss, prediction_scores, seq_relationship_score, hidden_states, attentions) + +@add_start_docstrings( + """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and + a `next sentence prediction (classification)` head. 
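The pattern used throughout this patch is that each scriptable forward returns a fixed-arity tuple of Optional tensors, so TorchScript can infer a static return type, while the user-facing wrappers such as BertModel above simply call `filter_outputs(super(), *args, **kwargs)`. The helper itself is defined outside this hunk; a hypothetical sketch of the assumed behaviour, dropping the `None` placeholders to restore the original variable-length output tuples, would be:

from typing import Any, Tuple

def filter_outputs(scriptable: Any, *args, **kwargs) -> Tuple:
    # Sketch only: run the scriptable parent's forward and keep the entries
    # that are actually populated, mirroring the pre-refactor output tuples.
    outputs = scriptable.forward(*args, **kwargs)
    return tuple(o for o in outputs if o is not None)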
""", + BERT_START_DOCSTRING, +) +class BertForPreTraining(BertScriptableForPreTraining): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + def forward(self, + input_ids = None, + attention_mask = None, + token_type_ids = None, + position_ids = None, + head_mask = None, + inputs_embeds = None, + labels = None, + next_sentence_label = None, + output_attentions =None, + output_hidden_states = None, + **kwargs): r""" labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. @@ -936,45 +977,6 @@ def forward( >>> prediction_scores, seq_relationship_scores = outputs[:2] """ - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - sequence_output, pooled_output, hidden_states, attentions = outputs - prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - - total_loss: Optional[torch.Tensor] = None - if labels is not None and next_sentence_label is not None: - masked_lm_loss = nn.functional.cross_entropy(prediction_scores.view(-1, self.vocab_size), labels.view(-1)) - next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - - return (total_loss, prediction_scores, seq_relationship_score, hidden_states, attentions) - -class BertForPreTraining(BertScriptableForPreTraining): - def __init__(self, config: BertConfig): - super().__init__(config) - self.config = config - - def forward(self, - input_ids = None, - attention_mask = None, - token_type_ids = None, - position_ids = None, - head_mask = None, - inputs_embeds = None, - labels = None, - next_sentence_label = None, - output_attentions =None, - output_hidden_states = None, - **kwargs): if "masked_lm_labels" in kwargs: warnings.warn( "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", @@ -1217,10 +1219,10 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ return {"input_ids": input_ids, "attention_mask": attention_mask} -@add_start_docstrings( - """Bert Model with a `next sentence prediction (classification)` head on top. 
""", BERT_START_DOCSTRING, -) class BertScriptableForNextSentencePrediction(BertPreTrainedModel): + """ + Torch-Script-able version of BertForNextSentencePrediction, below + """ def __init__(self, config): super().__init__(config) @@ -1229,7 +1231,6 @@ def __init__(self, config): self.init_weights() - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1242,6 +1243,38 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + pooled_output = outputs[1] + hidden_states, attentions = outputs[2], outputs[3] + + seq_relationship_score = self.cls(pooled_output) + + next_sentence_loss : Optional[torch.Tensor] = None + if next_sentence_label is not None: + next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + + return (next_sentence_loss, seq_relationship_score, hidden_states, attentions) + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, +) +class BertForNextSentencePrediction(BertScriptableForNextSentencePrediction): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + def forward(self, *args, **kwargs): r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. 
Input should be a sequence pair (see ``input_ids`` docstring) @@ -1282,7 +1315,32 @@ def forward( >>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1])) >>> assert logits[0, 0] < logits[0, 1] # next sentence was random """ + return filter_outputs(super(), *args, **kwargs) +class BertScriptableForSequenceClassification(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertScriptableModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1296,53 +1354,32 @@ def forward( pooled_output = outputs[1] hidden_states, attentions = outputs[2], outputs[3] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) - seq_relationship_score = self.cls(pooled_output) - - next_sentence_loss : Optional[torch.Tensor] = None - if next_sentence_label is not None: - next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - - return (next_sentence_loss, seq_relationship_score, hidden_states, attentions) - - -class BertForNextSentencePrediction(BertScriptableForNextSentencePrediction): - def __init__(self, config: BertConfig): - super().__init__(config) - self.config = config - def forward(self, *args, **kwargs): - return filter_outputs(super(), *args, **kwargs) + loss: Optional[torch.Tensor] = None + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss = nn.functional.mse_loss(logits.view(-1), labels.view(-1)) + else: + loss = nn.functional.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1)) + + return (loss, logits, hidden_states, attentions) @add_start_docstrings( """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
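The loss branch a few lines above switches between regression (MSE) and classification (cross-entropy) purely on `config.num_labels`. A standalone illustration of the two paths with toy tensors (shapes and values assumed, not taken from the patch):

import torch
import torch.nn as nn

num_labels = 1                               # regression case
logits = torch.randn(4, num_labels)
targets = torch.randn(4)
mse = nn.functional.mse_loss(logits.view(-1), targets.view(-1))

num_labels = 3                               # classification case
logits = torch.randn(4, num_labels)
labels = torch.tensor([0, 2, 1, 0])
ce = nn.functional.cross_entropy(logits.view(-1, num_labels), labels.view(-1))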
""", BERT_START_DOCSTRING, ) -class BertScriptableForSequenceClassification(BertPreTrainedModel): - def __init__(self, config): +class BertForSequenceClassification(BertScriptableForSequenceClassification): + def __init__(self, config: BertConfig): super().__init__(config) - self.num_labels = config.num_labels - - self.bert = BertScriptableModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - + self.config = config + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + def forward(self, *args, **kwargs): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. @@ -1368,45 +1405,8 @@ def forward( Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - pooled_output = outputs[1] - hidden_states, attentions = outputs[2], outputs[3] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - loss: Optional[torch.Tensor] = None - if labels is not None: - if self.num_labels == 1: - # We are doing regression - loss = nn.functional.mse_loss(logits.view(-1), labels.view(-1)) - else: - loss = nn.functional.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1)) - - return (loss, logits, hidden_states, attentions) - -class BertForSequenceClassification(BertScriptableForSequenceClassification): - def __init__(self, config: BertConfig): - super().__init__(config) - self.config = config - def forward(self, *args, **kwargs): return filter_outputs(super(), *args, **kwargs) -@add_start_docstrings( - """Bert Model with a multiple choice classification head on top (a linear layer on top of - the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", - BERT_START_DOCSTRING, -) class BertScriptableForMultipleChoice(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1417,8 +1417,6 @@ def __init__(self, config): self.init_weights() - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1431,32 +1429,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for computing the multiple choice classification loss. - Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension - of the input tensors. (see `input_ids` above) - - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): - Classification loss. - classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): - `num_choices` is the second dimension of the input tensors. (see `input_ids` above). - - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ + if input_ids is not None: num_choices = input_ids.shape[1] else: @@ -1496,18 +1469,47 @@ def forward( return (loss, reshaped_logits, hidden_states, attentions) +@add_start_docstrings( + """Bert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + BERT_START_DOCSTRING, +) class BertForMultipleChoice(BertScriptableForMultipleChoice): def __init__(self, config: BertConfig): super().__init__(config) self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) + @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension + of the input tensors. 
(see `input_ids` above) + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification loss. + classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): + `num_choices` is the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ return filter_outputs(super(), *args, **kwargs) -@add_start_docstrings( - """Bert Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - BERT_START_DOCSTRING, -) class BertScriptableForTokenClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1520,8 +1522,6 @@ def __init__(self, config): self.init_weights() - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1534,30 +1534,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Labels for computing the token classification loss. - Indices should be in ``[0, ..., config.num_labels - 1]``. - - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : - Classification loss. - scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1590,18 +1567,44 @@ def forward( return (loss, logits, hidden_states, attentions) +@add_start_docstrings( + """Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + BERT_START_DOCSTRING, +) class BertForTokenClassification(BertScriptableForTokenClassification): def __init__(self, config: BertConfig): super().__init__(config) self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : + Classification loss. + scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ return filter_outputs(super(), *args, **kwargs) -@add_start_docstrings( - """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - BERT_START_DOCSTRING, -) class BertScriptableForQuestionAnswering(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1612,8 +1615,6 @@ def __init__(self, config): self.init_weights() - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1627,37 +1628,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, )-> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. - start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1695,9 +1666,47 @@ def forward( return (total_loss, start_logits, end_logits, hidden_states, attentions) +@add_start_docstrings( + """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", + BERT_START_DOCSTRING, +) class BertForQuestionAnswering(BertScriptableForQuestionAnswering): def __init__(self, config: BertConfig): super().__init__(config) self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). + end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ return filter_outputs(super(), *args, **kwargs) From f1504fa22162bbaccf7840f5b475f90bbf20d56b Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Mon, 20 Jul 2020 15:37:26 -0400 Subject: [PATCH 06/11] remove whitespace --- src/transformers/modeling_bert.py | 48 +++++++++++++++--------------- src/transformers/modeling_utils.py | 4 +-- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index d5a547c6d09faa..b10804283226ae 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -470,7 +470,7 @@ def forward( if output_hidden_states: all_hidden_states.append(hidden_states) layer_head_mask = None if head_mask is None else head_mask[i] - + """ if getattr(self.config, "gradient_checkpointing", False): @@ -690,23 +690,23 @@ class BertScriptableModel(BertPreTrainedModel): def __init__(self, config: BertConfig): super().__init__(config) - + self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.is_decoder = config.is_decoder self.num_hidden_layers = config.num_hidden_layers - + self.embeddings = BertEmbeddings(config) self.encoder = BertScriptableEncoder(config) self.pooler = BertPooler(config) # TorchScript cannot use parameters().next(), so get dtype explicitly. self.mask_dtype = self.embeddings.word_embeddings(torch.LongTensor([0])).dtype - + self.init_weights() def get_dtype(self): return self.mask_dtype - + def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -823,7 +823,7 @@ class BertModel(BertScriptableModel): def __init__(self, config: BertConfig): super().__init__(config) self.config = config - + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): @@ -862,7 +862,7 @@ def __init__(self, config): self.bert = BertScriptableModel(config) self.cls = BertPreTrainingHeads(config) self.vocab_size = config.vocab_size - + self.init_weights() def get_output_embeddings(self): @@ -881,7 +881,7 @@ def forward( output_attentions: Optional[bool] =None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -901,7 +901,7 @@ def forward( masked_lm_loss = nn.functional.cross_entropy(prediction_scores.view(-1, self.vocab_size), labels.view(-1)) next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - + return (total_loss, prediction_scores, seq_relationship_score, hidden_states, attentions) @add_start_docstrings( @@ -1262,7 +1262,7 @@ def forward( next_sentence_loss : Optional[torch.Tensor] = None if next_sentence_label is not None: next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - + return (next_sentence_loss, seq_relationship_score, hidden_states, attentions) @add_start_docstrings( @@ -1272,7 +1272,7 @@ class BertForNextSentencePrediction(BertScriptableForNextSentencePrediction): def __init__(self, config: BertConfig): super().__init__(config) self.config = config - + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def forward(self, *args, 
**kwargs): r""" @@ -1340,7 +1340,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1364,7 +1364,7 @@ def forward( loss = nn.functional.mse_loss(logits.view(-1), labels.view(-1)) else: loss = nn.functional.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1)) - + return (loss, logits, hidden_states, attentions) @add_start_docstrings( @@ -1376,7 +1376,7 @@ class BertForSequenceClassification(BertScriptableForSequenceClassification): def __init__(self, config: BertConfig): super().__init__(config) self.config = config - + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): @@ -1429,7 +1429,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - + if input_ids is not None: num_choices = input_ids.shape[1] else: @@ -1466,7 +1466,7 @@ def forward( loss: Optional[torch.Tensor] = None if labels is not None: loss = nn.functional.cross_entropy(reshaped_logits, labels) - + return (loss, reshaped_logits, hidden_states, attentions) @add_start_docstrings( @@ -1478,7 +1478,7 @@ class BertForMultipleChoice(BertScriptableForMultipleChoice): def __init__(self, config: BertConfig): super().__init__(config) self.config = config - + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): @@ -1515,7 +1515,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.ignore_index = nn.CrossEntropyLoss().ignore_index - + self.bert = BertScriptableModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) @@ -1534,7 +1534,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1576,7 +1576,7 @@ class BertForTokenClassification(BertScriptableForTokenClassification): def __init__(self, config: BertConfig): super().__init__(config) self.config = config - + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): @@ -1628,7 +1628,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, )-> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1642,7 +1642,7 @@ def forward( sequence_output = outputs[0] hidden_states, attentions = outputs[2], outputs[3] - + logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = 
start_logits.squeeze(-1) @@ -1663,7 +1663,7 @@ def forward( start_loss = nn.functional.cross_entropy(start_logits, start_positions, ignore_index=ignored_index) end_loss = nn.functional.cross_entropy(end_logits, end_positions, ignore_index=ignored_index) total_loss = (start_loss + end_loss) / 2 - + return (total_loss, start_logits, end_logits, hidden_states, attentions) @add_start_docstrings( @@ -1675,7 +1675,7 @@ class BertForQuestionAnswering(BertScriptableForQuestionAnswering): def __init__(self, config: BertConfig): super().__init__(config) self.config = config - + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 6cf0d38adea7f6..e6e3f7e3b4d139 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -158,7 +158,7 @@ def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: gen = self._named_members(get_members_fn=find_tensor_attributes) first_tuple = next(gen) return first_tuple[1].dtype - + @property def dtype(self): return self.get_dtype() @@ -275,7 +275,7 @@ def get_scriptable_head_mask(self, head_mask: Optional[Tensor], num_hidden_layer if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) return head_mask - + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers: int): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.dim() == 1: From 9b360d7e1f5c1c6c6b9410d4b3c7d06442074e62 Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Wed, 22 Jul 2020 11:11:41 -0400 Subject: [PATCH 07/11] add scripting test --- tests/test_modeling_bert.py | 77 +++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/tests/test_modeling_bert.py b/tests/test_modeling_bert.py index 8a689fb8434c90..648e4b2d820532 100644 --- a/tests/test_modeling_bert.py +++ b/tests/test_modeling_bert.py @@ -525,6 +525,83 @@ def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs) + # Copied from test_modeling_common.test_torchscript, but using jit.script, not jit.trace + def test_full_torchscript(self): + import copy + import tempfile + import os + + if is_torch_available(): + import torch + from transformers.modeling_bert import ( + BertScriptableModel, + BertScriptableForMultipleChoice, + BertScriptableForNextSentencePrediction, + BertScriptableForPreTraining, + BertScriptableForQuestionAnswering, + BertScriptableForSequenceClassification, + BertScriptableForTokenClassification, + ) + + config, unused_inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + configs_no_init = copy.deepcopy(config) + for key in configs_no_init.__dict__.keys(): + if "_range" in key or "_std" in key or "initializer_factor" in key: + setattr(configs_no_init, key, 1e-10) + + scriptable_model_classes = ( + BertScriptableModel, + BertScriptableForMultipleChoice, + BertScriptableForNextSentencePrediction, + BertScriptableForPreTraining, + BertScriptableForQuestionAnswering, + BertScriptableForSequenceClassification, + BertScriptableForTokenClassification, + ) + for model_class in scriptable_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + 
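                # torch.jit.script compiles the scriptable model ahead of time; a RuntimeError
                # here means the module still contains TorchScript-incompatible constructs.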
scripted = torch.jit.script(model) + except RuntimeError: + self.fail("Couldn't script module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "scripted_model.pt") + + try: + torch.jit.save(scripted, pt_file_name) + except Exception: + self.fail("Couldn't save scripted module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load scripted module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + + @slow def test_model_from_pretrained(self): for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: From 9aae7cce89c55dc25de3af87ddc1ce17a5a525da Mon Sep 17 00:00:00 2001 From: Sam Brody Date: Wed, 22 Jul 2020 11:41:16 -0400 Subject: [PATCH 08/11] put comments back on scriptable for diff --- src/transformers/modeling_bert.py | 491 +++++++++++++++--------------- 1 file changed, 242 insertions(+), 249 deletions(-) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index b10804283226ae..f32230242db845 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -684,10 +684,26 @@ def _init_weights(self, module): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. """ +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) class BertScriptableModel(BertPreTrainedModel): - """ Torch-Script-able version of BertModel, below. """ + The model can behave as an encoder (with only self-attention) as well + as a decoder, in which case a layer of cross-attention is added between + the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, + Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the + :obj:`is_decoder` argument of the configuration set to :obj:`True`; an + :obj:`encoder_hidden_states` is expected as an input to the forward pass. + + .. _`Attention is all you need`: + https://arxiv.org/abs/1706.03762 + + """ def __init__(self, config: BertConfig): super().__init__(config) @@ -734,6 +750,32 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. 
The Linear + layer weights are trained from the next sentence prediction (classification) + objective during pre-training. + + This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.output_hidden_states @@ -800,26 +842,7 @@ def forward( return outputs # sequence_output, pooled_output, (hidden_states), (attentions) -@add_start_docstrings( - "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", - BERT_START_DOCSTRING, -) -class BertModel(BertScriptableModel): - """ - - The model can behave as an encoder (with only self-attention) as well - as a decoder, in which case a layer of cross-attention is added between - the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, - Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the - :obj:`is_decoder` argument of the configuration set to :obj:`True`; an - :obj:`encoder_hidden_states` is expected as an input to the forward pass. - - .. _`Attention is all you need`: - https://arxiv.org/abs/1706.03762 - - """ +class BertModel(BertScriptableModel): def __init__(self, config: BertConfig): super().__init__(config) self.config = config @@ -827,34 +850,13 @@ def __init__(self, config: BertConfig): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): - r""" - Return: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during pre-training. 
- - This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ return filter_outputs(super(), *args, **kwargs) +@add_start_docstrings( + """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and + a `next sentence prediction (classification)` head. """, + BERT_START_DOCSTRING, +) class BertScriptableForPreTraining(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -881,52 +883,6 @@ def forward( output_attentions: Optional[bool] =None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - sequence_output, pooled_output, hidden_states, attentions = outputs - prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - - total_loss: Optional[torch.Tensor] = None - if labels is not None and next_sentence_label is not None: - masked_lm_loss = nn.functional.cross_entropy(prediction_scores.view(-1, self.vocab_size), labels.view(-1)) - next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - - return (total_loss, prediction_scores, seq_relationship_score, hidden_states, attentions) - -@add_start_docstrings( - """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and - a `next sentence prediction (classification)` head. """, - BERT_START_DOCSTRING, -) -class BertForPreTraining(BertScriptableForPreTraining): - def __init__(self, config: BertConfig): - super().__init__(config) - self.config = config - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - def forward(self, - input_ids = None, - attention_mask = None, - token_type_ids = None, - position_ids = None, - head_mask = None, - inputs_embeds = None, - labels = None, - next_sentence_label = None, - output_attentions =None, - output_hidden_states = None, - **kwargs): r""" labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. 
@@ -977,6 +933,46 @@ def forward(self, >>> prediction_scores, seq_relationship_scores = outputs[:2] """ + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + sequence_output, pooled_output, hidden_states, attentions = outputs + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss: Optional[torch.Tensor] = None + if labels is not None and next_sentence_label is not None: + masked_lm_loss = nn.functional.cross_entropy(prediction_scores.view(-1, self.vocab_size), labels.view(-1)) + next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + return (total_loss, prediction_scores, seq_relationship_score, hidden_states, attentions) + +class BertForPreTraining(BertScriptableForPreTraining): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + def forward(self, + input_ids = None, + attention_mask = None, + token_type_ids = None, + position_ids = None, + head_mask = None, + inputs_embeds = None, + labels = None, + next_sentence_label = None, + output_attentions =None, + output_hidden_states = None, + **kwargs): if "masked_lm_labels" in kwargs: warnings.warn( "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", @@ -1219,6 +1215,9 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ return {"input_ids": input_ids, "attention_mask": attention_mask} +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, +) class BertScriptableForNextSentencePrediction(BertPreTrainedModel): """ Torch-Script-able version of BertForNextSentencePrediction, below @@ -1243,38 +1242,6 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - pooled_output = outputs[1] - hidden_states, attentions = outputs[2], outputs[3] - - seq_relationship_score = self.cls(pooled_output) - - next_sentence_loss : Optional[torch.Tensor] = None - if next_sentence_label is not None: - next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - - return (next_sentence_loss, seq_relationship_score, hidden_states, attentions) - -@add_start_docstrings( - """Bert Model with a `next sentence prediction (classification)` head on top. 
""", BERT_START_DOCSTRING, -) -class BertForNextSentencePrediction(BertScriptableForNextSentencePrediction): - def __init__(self, config: BertConfig): - super().__init__(config) - self.config = config - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - def forward(self, *args, **kwargs): r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) @@ -1315,8 +1282,43 @@ def forward(self, *args, **kwargs): >>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1])) >>> assert logits[0, 0] < logits[0, 1] # next sentence was random """ + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + pooled_output = outputs[1] + hidden_states, attentions = outputs[2], outputs[3] + + seq_relationship_score = self.cls(pooled_output) + + next_sentence_loss : Optional[torch.Tensor] = None + if next_sentence_label is not None: + next_sentence_loss = nn.functional.cross_entropy(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + + return (next_sentence_loss, seq_relationship_score, hidden_states, attentions) + +class BertForNextSentencePrediction(BertScriptableForNextSentencePrediction): + def __init__(self, config: BertConfig): + super().__init__(config) + self.config = config + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + def forward(self, *args, **kwargs): + return filter_outputs(super(), *args, **kwargs) +@add_start_docstrings( + """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + BERT_START_DOCSTRING, +) class BertScriptableForSequenceClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1340,7 +1342,31 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the sequence classification/regression loss. + Indices should be in :obj:`[0, ..., config.num_labels - 1]`. + If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). 
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1367,11 +1393,6 @@ def forward( return (loss, logits, hidden_states, attentions) -@add_start_docstrings( - """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of - the pooled output) e.g. for GLUE tasks. """, - BERT_START_DOCSTRING, -) class BertForSequenceClassification(BertScriptableForSequenceClassification): def __init__(self, config: BertConfig): super().__init__(config) @@ -1380,33 +1401,13 @@ def __init__(self, config: BertConfig): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for computing the sequence classification/regression loss. - Indices should be in :obj:`[0, ..., config.num_labels - 1]`. - If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). - - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): - Classification (or regression if config.num_labels==1) loss. - logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ return filter_outputs(super(), *args, **kwargs) +@add_start_docstrings( + """Bert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + BERT_START_DOCSTRING, +) class BertScriptableForMultipleChoice(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1429,7 +1430,32 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification loss. + classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): + `num_choices` is the second dimension of the input tensors. (see `input_ids` above). + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ if input_ids is not None: num_choices = input_ids.shape[1] else: @@ -1469,11 +1495,6 @@ def forward( return (loss, reshaped_logits, hidden_states, attentions) -@add_start_docstrings( - """Bert Model with a multiple choice classification head on top (a linear layer on top of - the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - BERT_START_DOCSTRING, -) class BertForMultipleChoice(BertScriptableForMultipleChoice): def __init__(self, config: BertConfig): super().__init__(config) @@ -1482,34 +1503,13 @@ def __init__(self, config: BertConfig): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for computing the multiple choice classification loss. - Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension - of the input tensors. 
(see `input_ids` above) - - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): - Classification loss. - classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): - `num_choices` is the second dimension of the input tensors. (see `input_ids` above). - - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ return filter_outputs(super(), *args, **kwargs) +@add_start_docstrings( + """Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + BERT_START_DOCSTRING, +) class BertScriptableForTokenClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1534,7 +1534,29 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : + Classification loss. + scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1567,11 +1589,6 @@ def forward( return (loss, logits, hidden_states, attentions) -@add_start_docstrings( - """Bert Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - BERT_START_DOCSTRING, -) class BertForTokenClassification(BertScriptableForTokenClassification): def __init__(self, config: BertConfig): super().__init__(config) @@ -1580,31 +1597,13 @@ def __init__(self, config: BertConfig): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Labels for computing the token classification loss. - Indices should be in ``[0, ..., config.num_labels - 1]``. - - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : - Classification loss. - scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ return filter_outputs(super(), *args, **kwargs) +@add_start_docstrings( + """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, + BERT_START_DOCSTRING, +) class BertScriptableForQuestionAnswering(BertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1628,7 +1627,36 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, )-> Tuple[Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]], Optional[List[torch.Tensor]]] : + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. 
+ end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). + end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1666,11 +1694,6 @@ def forward( return (total_loss, start_logits, end_logits, hidden_states, attentions) -@add_start_docstrings( - """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - BERT_START_DOCSTRING, -) class BertForQuestionAnswering(BertScriptableForQuestionAnswering): def __init__(self, config: BertConfig): super().__init__(config) @@ -1679,34 +1702,4 @@ def __init__(self, config: BertConfig): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased") def forward(self, *args, **kwargs): - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. 
- - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. - start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ return filter_outputs(super(), *args, **kwargs) From 5e61455ab9dc1b5002f5003ae6e3e7188f461df6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 2 Sep 2020 17:30:41 +0200 Subject: [PATCH 09/11] benchmark torchscript bert --- benchmark_pytorch_scripting.py | 56 +++++++++++++++++++++++++ src/transformers/__init__.py | 1 + src/transformers/benchmark/benchmark.py | 16 ++----- 3 files changed, 60 insertions(+), 13 deletions(-) create mode 100755 benchmark_pytorch_scripting.py diff --git a/benchmark_pytorch_scripting.py b/benchmark_pytorch_scripting.py new file mode 100755 index 00000000000000..ca522d1d5aa46d --- /dev/null +++ b/benchmark_pytorch_scripting.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +from transformers.modeling_bert import BertScriptableModel +from transformers import BertConfig, BertModel, PyTorchBenchmark, PyTorchBenchmarkArguments +import torch + + +def get_model(torchscript=False, device="cpu", config=None): + if not torchscript: + return BertModel(config).to(device).eval() + model = BertScriptableModel(config).to(device).eval() + return torch.jit.script(model) + + +def get_input_ids(input_tensor_type="single_tensor", config=None, batch_size=None, sequence_length=None, device="cpu"): + if input_tensor_type == "single_tensor": + return [torch.randint(config.vocab_size, (batch_size, sequence_length), dtype=torch.long, device=device)] + elif input_tensor_type == "batched_tensors": + num_batches = batch_size // 8 + sequence_lengths = [torch.randint(1, sequence_length, (1,)).item() for i in range(num_batches)] + print("Seq Length", sequence_lengths) + return [torch.randint(config.vocab_size, (10, sequence_length), dtype=torch.long, device=device) for sequence_length in sequence_lengths] + elif input_tensor_type == "multiple_tensors": + sequence_lengths = [torch.randint(1, sequence_length, (1,)).item() for i in range(batch_size)] + print("Seq Length", sequence_lengths) + return [torch.randint(config.vocab_size, (1, sequence_length), dtype=torch.long, device=device) for sequence_length in sequence_lengths] + else: + raise 
ValueError(f"{input_tensor_type} does not exist.") + + +def get_inference_func(device, config, sequence_length, batch_size, input_tensor_type, torchscript): + model = get_model(torchscript, device, config) + input_ids = get_input_ids(input_tensor_type=input_tensor_type, config=config, batch_size=batch_size, sequence_length=sequence_length, device=device) + + def func(): + for inputs in input_ids: + result = model(inputs) + return result + + return func + + +def run_benchmark(batch_sizes, sequence_lengths, input_tensor_type="multiple_tensors", torchscript=True): + config = BertConfig.from_pretrained("bert-base-uncased") + args = PyTorchBenchmarkArguments(models=[f"Type: {input_tensor_type} - Script: {torchscript}"], no_memory=True, sequence_lengths=sequence_lengths, batch_sizes=batch_sizes, no_multi_process=True) + device = args.device + benchmark = PyTorchBenchmark(args, configs=[config]) + + def _prepare_inference_func(model_name, batch_size, sequence_length): + return get_inference_func(device=device, config=config, sequence_length=sequence_length, batch_size=batch_size, input_tensor_type=input_tensor_type, torchscript=torchscript) + + benchmark._prepare_inference_func = _prepare_inference_func + benchmark.run() + + +run_benchmark([10], [10]) +run_benchmark([10], [10], torchscript=False) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e7aaed67189235..aa2612b2e6db00 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -211,6 +211,7 @@ from .modeling_bert import ( BertPreTrainedModel, + BertScriptableModel, BertModel, BertForPreTraining, BertForMaskedLM, diff --git a/src/transformers/benchmark/benchmark.py b/src/transformers/benchmark/benchmark.py index 37d2e26aaf1b41..2ba259e152cec8 100644 --- a/src/transformers/benchmark/benchmark.py +++ b/src/transformers/benchmark/benchmark.py @@ -86,19 +86,9 @@ def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_len config = self.config_dict[model_name] if self.args.torchscript: + from transformers.modeling_bert import BertScriptableModel config.torchscript = True - - has_model_class_in_config = hasattr(config, "architecture") and len(config.architectures) > 1 - if not self.args.only_pretrain_model and has_model_class_in_config: - try: - model_class = config.architectures[0] - transformers_module = __import__("transformers", fromlist=[model_class]) - model_cls = getattr(transformers_module, model_class) - model = model_cls(config) - except ImportError: - raise ImportError( - f"{model_class} does not exist. If you just want to test the pretrained model, you might want to set `--only_pretrain_model` or `args.only_pretrain_model=True`." 
- ) + model = BertScriptableModel(config) else: model = MODEL_MAPPING[config.__class__](config) @@ -118,7 +108,7 @@ def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_len if self.args.torchscript: with torch.no_grad(): - inference_model = torch.jit.trace(model, input_ids) + inference_model = torch.jit.script(model) else: inference_model = model From f3b939cc19d3ccda33858f66fe981cc95b7e500b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 2 Sep 2020 17:26:40 +0000 Subject: [PATCH 10/11] update --- benchmark_pytorch_scripting.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/benchmark_pytorch_scripting.py b/benchmark_pytorch_scripting.py index ca522d1d5aa46d..c4bf3c66f3d718 100755 --- a/benchmark_pytorch_scripting.py +++ b/benchmark_pytorch_scripting.py @@ -11,17 +11,15 @@ def get_model(torchscript=False, device="cpu", config=None): return torch.jit.script(model) -def get_input_ids(input_tensor_type="single_tensor", config=None, batch_size=None, sequence_length=None, device="cpu"): - if input_tensor_type == "single_tensor": +def get_input_ids(input_tensor_type="single", config=None, batch_size=None, sequence_length=None, device="cpu"): + if input_tensor_type == "single": return [torch.randint(config.vocab_size, (batch_size, sequence_length), dtype=torch.long, device=device)] - elif input_tensor_type == "batched_tensors": + elif input_tensor_type == "batched": num_batches = batch_size // 8 sequence_lengths = [torch.randint(1, sequence_length, (1,)).item() for i in range(num_batches)] - print("Seq Length", sequence_lengths) return [torch.randint(config.vocab_size, (10, sequence_length), dtype=torch.long, device=device) for sequence_length in sequence_lengths] - elif input_tensor_type == "multiple_tensors": + elif input_tensor_type == "multiple": sequence_lengths = [torch.randint(1, sequence_length, (1,)).item() for i in range(batch_size)] - print("Seq Length", sequence_lengths) return [torch.randint(config.vocab_size, (1, sequence_length), dtype=torch.long, device=device) for sequence_length in sequence_lengths] else: raise ValueError(f"{input_tensor_type} does not exist.") @@ -31,17 +29,17 @@ def get_inference_func(device, config, sequence_length, batch_size, input_tensor model = get_model(torchscript, device, config) input_ids = get_input_ids(input_tensor_type=input_tensor_type, config=config, batch_size=batch_size, sequence_length=sequence_length, device=device) + @torch.no_grad() def func(): - for inputs in input_ids: - result = model(inputs) + for i in input_ids: + result = model(i) return result - return func -def run_benchmark(batch_sizes, sequence_lengths, input_tensor_type="multiple_tensors", torchscript=True): +def run_benchmark(batch_sizes, sequence_lengths, input_tensor_type="multiple", torchscript=True): config = BertConfig.from_pretrained("bert-base-uncased") - args = PyTorchBenchmarkArguments(models=[f"Type: {input_tensor_type} - Script: {torchscript}"], no_memory=True, sequence_lengths=sequence_lengths, batch_sizes=batch_sizes, no_multi_process=True) + args = PyTorchBenchmarkArguments(models=[f"Type: {input_tensor_type} - Script: {torchscript}"], no_memory=True, sequence_lengths=sequence_lengths, batch_sizes=batch_sizes, no_multi_process=True, repeat=1, torchscript=True, no_env_print=True) device = args.device benchmark = PyTorchBenchmark(args, configs=[config]) @@ -52,5 +50,12 @@ def _prepare_inference_func(model_name, batch_size, sequence_length): benchmark.run() -run_benchmark([10], 
[10]) -run_benchmark([10], [10], torchscript=False) +torch.manual_seed(0) +run_benchmark([500, 2500], [128, 512]) +torch.manual_seed(0) +run_benchmark([500, 2500], [128, 512], torchscript=False) + +torch.manual_seed(0) +run_benchmark([512, 4096], [128, 512], input_tensor_type="batched") +torch.manual_seed(0) +run_benchmark([512, 4096], [128, 512], torchscript=False, input_tensor_type="batched") From 132600b9d89d81fae1c8238a3babda12e38a5198 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 3 Sep 2020 17:02:18 +0000 Subject: [PATCH 11/11] correct benchmark --- benchmark_pytorch_scripting.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/benchmark_pytorch_scripting.py b/benchmark_pytorch_scripting.py index c4bf3c66f3d718..cb916edfb16478 100755 --- a/benchmark_pytorch_scripting.py +++ b/benchmark_pytorch_scripting.py @@ -4,9 +4,12 @@ import torch -def get_model(torchscript=False, device="cpu", config=None): +def get_model(torchscript=False, device="cpu", config=None, max_seq_length=None): + input_ids = torch.ones((1, max_seq_length), device=device, dtype=torch.long) if not torchscript: - return BertModel(config).to(device).eval() + model = BertModel(config).to(device).eval() + traced_model = torch.jit.trace(model, input_ids) + return traced_model model = BertScriptableModel(config).to(device).eval() return torch.jit.script(model) @@ -26,7 +29,7 @@ def get_input_ids(input_tensor_type="single", config=None, batch_size=None, sequ def get_inference_func(device, config, sequence_length, batch_size, input_tensor_type, torchscript): - model = get_model(torchscript, device, config) + model = get_model(torchscript, device, config, sequence_length) input_ids = get_input_ids(input_tensor_type=input_tensor_type, config=config, batch_size=batch_size, sequence_length=sequence_length, device=device) @torch.no_grad()