huggingface · fabiopetroni · Nov 13, 2018
diff --git a/convert_tf_checkpoint_to_pytorch.py b/convert_tf_checkpoint_to_pytorch.py
@@ -68,6 +68,21 @@ def convert():
  arrays.append(array)
 
  for name, array in zip(names, arrays):
+
+ # copy the word embeddings / output bias into model.output_layer (weight / bias)
+ if (name=="bert/embeddings/word_embeddings"):
+ pointer = model
+ pointer = getattr(pointer, 'output_layer')
+ pointer = getattr(pointer, 'weight')
+ assert pointer.shape == array.shape
+ pointer.data = torch.from_numpy(array)
+ elif (name=="cls/predictions/output_bias"):
+ pointer = model
+ pointer = getattr(pointer, 'output_layer')
+ pointer = getattr(pointer, 'bias')
+ assert pointer.shape == array.shape
+ pointer.data = torch.from_numpy(array)
+
  if not name.startswith("bert"):
  print("Skipping {}".format(name))
  continue

diff --git a/modeling.py b/modeling.py
@@ -277,7 +277,7 @@ class BERTEncoder(nn.Module):
  def __init__(self, config):
  super(BERTEncoder, self).__init__()
  layer = BERTLayer(config)
- self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) 
+ self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
 
  def forward(self, hidden_states, attention_mask):
  all_encoder_layers = []
@@ -330,6 +330,10 @@ def __init__(self, config: BertConfig):
  self.encoder = BERTEncoder(config)
  self.pooler = BERTPooler(config)
 
+ # the output weights are meant to equal the input word embeddings (a separate
+ # parameter here, not tied), plus an output-only bias per vocabulary entry
+ self.output_layer = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
+
  def forward(self, input_ids, token_type_ids=None, attention_mask=None):
  if attention_mask is None:
  attention_mask = torch.ones_like(input_ids)