From 3d47a7f8ab8504a98fba8c8ed1397c28e68cca8d Mon Sep 17 00:00:00 2001 From: Rabeeh KARIMI Date: Wed, 14 Aug 2019 10:58:26 +0200 Subject: [PATCH 001/119] loads the tokenizer for each checkpoint, to solve the reproducability issue --- examples/run_glue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/run_glue.py b/examples/run_glue.py index f017db2f6f91de..c221b0eb7a3c55 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -463,6 +463,7 @@ def main(): for checkpoint in checkpoints: global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) + tokenizer = tokenizer_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=global_step) result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) From 814a3f4e017020d67ce69048e2647e7bf5d0784f Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 21 Aug 2019 14:11:14 -0400 Subject: [PATCH 002/119] Removed `attention_mask` from GPT-2 and GPT documentation. Corrected `multiple_choice_labels` to actual name `mc_labels` --- pytorch_transformers/modeling_gpt2.py | 10 +--------- pytorch_transformers/modeling_openai.py | 12 ++---------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 9022048d6d7531..2980cf269ac81e 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -408,10 +408,6 @@ def init_weights(self, module): list of ``torch.FloatTensor`` (one for each layer): that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: @@ -642,10 +638,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): list of ``torch.FloatTensor`` (one for each layer): that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: @@ -656,7 +648,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to ``-1`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` - **multiple_choice_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: + **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: Labels for computing the multiple choice classification loss. 
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index cfea7687368e10..690aa7812be16b 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -415,11 +415,7 @@ def init_weights(self, module): **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: A parallel sequence of tokens (can be used to indicate various portions of the inputs). The embeddings from these tokens will be summed with the respective token embeddings. - Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices) **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: @@ -626,10 +622,6 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): A parallel sequence of tokens (can be used to indicate various portions of the inputs). The embeddings from these tokens will be summed with the respective token embeddings. Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: @@ -640,7 +632,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to ``-1`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` - **multiple_choice_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: + **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. 
(see `input_ids` above) From 034aa0c2d743f5de787b650d9ef38ea49c66df3f Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 21 Aug 2019 17:27:38 -0400 Subject: [PATCH 003/119] Fixed GPT2DoubleHeadsModel example and weight tying --- pytorch_transformers/modeling_gpt2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 2980cf269ac81e..af8d3ad10f08d5 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -682,6 +682,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2DoubleHeadsModel.from_pretrained('gpt2') tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!) + model.resize_token_embeddings(tokenizer.vocab_size + 1) choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1 @@ -696,6 +697,7 @@ def __init__(self, config): self.multiple_choice_head = SequenceSummary(config) self.apply(self.init_weights) + self.tie_weights() def tie_weights(self): """ Make sure we are sharing the input and output embeddings. From 47267ba556093e3cde815c655b14f9918df97dd3 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 21 Aug 2019 17:50:16 -0400 Subject: [PATCH 004/119] OpenAI GPT-2 now depends on CommonTests. --- .../tests/modeling_common_test.py | 3 + .../tests/modeling_gpt2_test.py | 195 ++++++++++++++++-- 2 files changed, 182 insertions(+), 16 deletions(-) diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py index e974ae865d60f4..64a17f9db07d3a 100644 --- a/pytorch_transformers/tests/modeling_common_test.py +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -210,6 +210,9 @@ def test_head_pruning(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + for model_class in self.all_model_classes: config.output_attentions = True config.output_hidden_states = False diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py index 4e32cc37e15aa6..fa8f79a40b0072 100644 --- a/pytorch_transformers/tests/modeling_gpt2_test.py +++ b/pytorch_transformers/tests/modeling_gpt2_test.py @@ -18,31 +18,194 @@ import unittest import pytest +import shutil -from pytorch_transformers import (GPT2Config, GPT2Model, +from pytorch_transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2LMHeadModel, GPT2DoubleHeadsModel) -from .modeling_common_test import CommonTestCases, ConfigTester +from .modeling_common_test import CommonTestCases, ConfigTester, ids_tensor -class GPT2ModelTest(unittest.TestCase): + +class GPT2ModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) + + class GPT2ModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + 
type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = GPT2Config( + vocab_size_or_config_json_file=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels + + def check_loss_output(self, result): + self.parent.assertListEqual( + list(result["loss"].size()), + []) + + def create_and_check_gpt2_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = GPT2Model(config=config) + model.eval() + + model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + model(input_ids, token_type_ids=token_type_ids) + sequence_output, presents = model(input_ids) + + result = { + "sequence_output": sequence_output, + "presents": presents, + } + self.parent.assertListEqual( + list(result["sequence_output"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertEqual(len(result["presents"]), config.n_layer) + + def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = GPT2LMHeadModel(config) + model.eval() + + loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + + result = { + "loss": loss, + "lm_logits": lm_logits + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["lm_logits"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, 
token_type_ids, *args): + model = GPT2DoubleHeadsModel(config) + model.eval() + + loss, lm_logits, mc_logits, _ = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids) + + result = { + "loss": loss, + "lm_logits": lm_logits + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["lm_logits"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'head_mask': head_mask + } + + return config, inputs_dict + + def setUp(self): + self.model_tester = GPT2ModelTest.GPT2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37) def test_config(self): - config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37) - config_tester.run_common_tests() + self.config_tester.run_common_tests() - def test_model(self): - model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model, - lm_head_model_class=GPT2LMHeadModel, - double_head_model_class=GPT2DoubleHeadsModel) - model_tester.run_common_tests(test_presents=True) + def test_gpt2_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model(*config_and_inputs) - @pytest.mark.slow - def test_pretrained(self): - model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model, - lm_head_model_class=GPT2LMHeadModel, - double_head_model_class=GPT2DoubleHeadsModel) - model_tester.run_slow_tests() + def test_gpt2_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_gpt2_double_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs) + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) if __name__ == "__main__": unittest.main() From 55f69a11b638a54164d3e61921d60a8455b8066d Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 21 Aug 2019 18:09:25 -0400 Subject: [PATCH 005/119] OpenAI GPT tests now extend CommonTests --- .../tests/modeling_gpt2_test.py | 2 + .../tests/modeling_openai_test.py | 195 ++++++++++++++++-- 2 files changed, 181 insertions(+), 16 deletions(-) diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py index fa8f79a40b0072..1786ada54cdb9a 100644 --- a/pytorch_transformers/tests/modeling_gpt2_test.py +++ b/pytorch_transformers/tests/modeling_gpt2_test.py @@ -207,5 +207,7 @@ def test_model_from_pretrained(self): model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir) shutil.rmtree(cache_dir) self.assertIsNotNone(model) + + if __name__ == "__main__": unittest.main() diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py index 243afb95019936..0fcb4b7d64ded6 100644 --- 
a/pytorch_transformers/tests/modeling_openai_test.py +++ b/pytorch_transformers/tests/modeling_openai_test.py @@ -18,31 +18,194 @@ import unittest import pytest +import shutil -from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, - OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) +from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) -from .modeling_common_test import CommonTestCases, ConfigTester +from .modeling_common_test import CommonTestCases, ConfigTester, ids_tensor -class OpenAIModelTest(unittest.TestCase): + +class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) + + class OpenAIGPTModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = OpenAIGPTConfig( + vocab_size_or_config_json_file=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels + + def check_loss_output(self, result): + self.parent.assertListEqual( + 
list(result["loss"].size()), + []) + + def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = OpenAIGPTModel(config=config) + model.eval() + + model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + model(input_ids, token_type_ids=token_type_ids) + (sequence_output,) = model(input_ids) + + result = { + "sequence_output": sequence_output + } + self.parent.assertListEqual( + list(result["sequence_output"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + + def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = OpenAIGPTLMHeadModel(config) + model.eval() + + loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + + result = { + "loss": loss, + "lm_logits": lm_logits + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["lm_logits"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = OpenAIGPTDoubleHeadsModel(config) + model.eval() + + loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids) + + result = { + "loss": loss, + "lm_logits": lm_logits + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["lm_logits"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'head_mask': head_mask + } + + return config, inputs_dict + + def setUp(self): + self.model_tester = OpenAIGPTModelTest.OpenAIGPTModelTester(self) + self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37) def test_config(self): - config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37) - config_tester.run_common_tests() + self.config_tester.run_common_tests() - def test_model(self): - model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel, - lm_head_model_class=OpenAIGPTLMHeadModel, - double_head_model_class=OpenAIGPTDoubleHeadsModel) - model_tester.run_common_tests(test_presents=False) + def test_openai_gpt_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs) + + def test_openai_gpt_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_openai_gpt_double_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs) @pytest.mark.slow - def test_pretrained(self): - model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel, - lm_head_model_class=OpenAIGPTLMHeadModel, - double_head_model_class=OpenAIGPTDoubleHeadsModel) - model_tester.run_slow_tests() + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in 
list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() From e13465fb8bbabe3bbd528761818403aa5d2e128e Mon Sep 17 00:00:00 2001 From: David Pollack Date: Fri, 23 Aug 2019 12:12:12 +0200 Subject: [PATCH 006/119] change layernorm code to pytorch's native layer norm --- pytorch_transformers/modeling_bert.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index 7b34b3fd9029f7..8bf281feb92372 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -224,20 +224,7 @@ def __init__(self, from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm except (ImportError, AttributeError) as e: logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .") - class BertLayerNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-12): - """Construct a layernorm module in the TF style (epsilon inside the square root). - """ - super(BertLayerNorm, self).__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.bias = nn.Parameter(torch.zeros(hidden_size)) - self.variance_epsilon = eps - - def forward(self, x): - u = x.mean(-1, keepdim=True) - s = (x - u).pow(2).mean(-1, keepdim=True) - x = (x - u) / torch.sqrt(s + self.variance_epsilon) - return self.weight * x + self.bias + BertLayerNorm = torch.nn.LayerNorm class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. From 436ce072183e3e134d2fbc286f6c72f012f31e74 Mon Sep 17 00:00:00 2001 From: Shijie Wu Date: Fri, 23 Aug 2019 14:40:17 -0400 Subject: [PATCH 007/119] Tokenization behave the same as original XLM proprocessing for most languages except zh, ja and th; Change API to allow specifying language in `tokenize` --- pytorch_transformers/tokenization_xlm.py | 148 ++++++++++++++++++++--- requirements.txt | 4 +- setup.py | 3 +- 3 files changed, 135 insertions(+), 20 deletions(-) diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index 2d2f3a8cd4db64..8418a5d6f3bf53 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -20,8 +20,11 @@ import logging import os import re +import unicodedata from io import open +import sacremoses as sm + from .tokenization_utils import PreTrainedTokenizer from .tokenization_bert import BasicTokenizer @@ -95,6 +98,93 @@ def text_standardize(text): text = re.sub(r'[^\S\n]+', ' ', text) return text.strip() + +def lowercase_and_remove_accent(text): + """ + Lowercase and strips accents from a piece of text based on + https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py + """ + text = text.lower() + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output).lower() + + +def replace_unicode_punct(text): + ''' + Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl + ''' + text = text.replace(',', ',') + text = text.replace('。 *', '. 
')
+    text = text.replace('、', ',')
+    text = text.replace('”', '"')
+    text = text.replace('“', '"')
+    text = text.replace('∶', ':')
+    text = text.replace(':', ':')
+    text = text.replace('?', '?')
+    text = text.replace('《', '"')
+    text = text.replace('》', '"')
+    text = text.replace(')', ')')
+    text = text.replace('!', '!')
+    text = text.replace('(', '(')
+    text = text.replace(';', ';')
+    text = text.replace('1', '"')
+    text = text.replace('」', '"')
+    text = text.replace('「', '"')
+    text = text.replace('0', '0')
+    text = text.replace('3', '3')
+    text = text.replace('2', '2')
+    text = text.replace('5', '5')
+    text = text.replace('6', '6')
+    text = text.replace('9', '9')
+    text = text.replace('7', '7')
+    text = text.replace('8', '8')
+    text = text.replace('4', '4')
+    text = re.sub(r'.\s*', '. ', text)
+    text = text.replace('~', '~')
+    text = text.replace('’', '\'')
+    text = text.replace('…', '...')
+    text = text.replace('━', '-')
+    text = text.replace('〈', '<')
+    text = text.replace('〉', '>')
+    text = text.replace('【', '[')
+    text = text.replace('】', ']')
+    text = text.replace('%', '%')
+    return text
+
+
+def remove_non_printing_char(text):
+    '''
+    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
+    '''
+    output = []
+    for char in text:
+        cat = unicodedata.category(char)
+        if cat.startswith('C'):
+            continue
+        output.append(char)
+    return "".join(output)
+
+
+def romanian_preprocessing(text):
+    '''Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`'''
+    # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py
+    text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219")
+    text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b")
+    # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py
+    text = text.replace("\u0218", "S").replace("\u0219", "s") #s-comma
+    text = text.replace("\u021a", "T").replace("\u021b", "t") #t-comma
+    text = text.replace("\u0102", "A").replace("\u0103", "a")
+    text = text.replace("\u00C2", "A").replace("\u00E2", "a")
+    text = text.replace("\u00CE", "I").replace("\u00EE", "i")
+    return text
+
+
 class XLMTokenizer(PreTrainedTokenizer):
     """
     BPE tokenizer for XLM, adapted from OpenAI BPE tokenizer.
Peculiarities: @@ -122,16 +212,14 @@ def __init__(self, vocab_file, merges_file, unk_token="", bos_token="", cls_token=cls_token, mask_token=mask_token, additional_special_tokens=additional_special_tokens, **kwargs) - try: - import ftfy - from spacy.lang.en import English - _nlp = English() - self.nlp = _nlp.Defaults.create_tokenizer(_nlp) - self.fix_text = ftfy.fix_text - except ImportError: - logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") - self.nlp = BasicTokenizer(do_lower_case=True) - self.fix_text = None + + # cache of sm.MosesPunctNormalizer instance + self.cache_moses_punct_normalizer = dict() + # cache of sm.MosesTokenizer instance + self.cache_moses_tokenizer = dict() + self.lang_with_custom_tokenizer = set(['zh', 'th', 'ja']) + # True for current supported model (v1.2.0), False for XLM-17 & 100 + self.do_lowercase_and_remove_accent = True self.encoder = json.load(open(vocab_file, encoding="utf-8")) self.decoder = {v:k for k,v in self.encoder.items()} @@ -140,6 +228,28 @@ def __init__(self, vocab_file, merges_file, unk_token="", bos_token="", self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} + def moses_punct_norm(self, text, lang): + if lang not in self.cache_moses_punct_normalizer: + punct_normalizer = sm.MosesPunctNormalizer(lang=lang) + self.cache_moses_punct_normalizer[lang] = punct_normalizer + else: + punct_normalizer = self.cache_moses_punct_normalizer[lang] + return punct_normalizer.normalize(text) + + def moses_tokenize(self, text, lang): + if lang not in self.cache_moses_tokenizer: + moses_tokenizer = sm.MosesTokenizer(lang=lang) + self.cache_moses_tokenizer[lang] = moses_tokenizer + else: + moses_tokenizer = self.cache_moses_tokenizer[lang] + return moses_tokenizer.tokenize(text, return_str=False, escape=False) + + def moses_pipeline(self, text, lang): + text = replace_unicode_punct(text) + text = self.moses_punct_norm(text, lang) + text = remove_non_printing_char(text) + return text + @property def vocab_size(self): return len(self.encoder) @@ -187,19 +297,21 @@ def bpe(self, token): self.cache[token] = word return word - def _tokenize(self, text): + def _tokenize(self, text, lang='en'): """ Tokenize a string. 
""" split_tokens = [] - if self.fix_text is None: - # Using BERT's BasicTokenizer - text = self.nlp.tokenize(text) + if self.do_lowercase_and_remove_accent: + text = lowercase_and_remove_accent(text) + if lang not in self.lang_with_custom_tokenizer: + text = self.moses_pipeline(text, lang=lang) + # TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step + if lang == 'ro': + text = romanian_preprocessing(text) + text = self.moses_tokenize(text, lang=lang) for token in text: split_tokens.extend([t for t in self.bpe(token).split(' ')]) else: - # Using SpaCy & ftfy (original tokenization process of OpenAI GPT) - text = self.nlp(text_standardize(self.fix_text(text))) - for token in text: - split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')]) + raise ValueError return split_tokens def _convert_token_to_id(self, token): diff --git a/requirements.txt b/requirements.txt index 76532d18a59258..01dca79d23bdba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,6 @@ requests # For OpenAI GPT regex # For XLNet -sentencepiece \ No newline at end of file +sentencepiece +# For XLM +sacremoses \ No newline at end of file diff --git a/setup.py b/setup.py index c9f80fc224af0e..297972226817f0 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,8 @@ 'requests', 'tqdm', 'regex', - 'sentencepiece'], + 'sentencepiece', + 'sacremoses'], entry_points={ 'console_scripts': [ "pytorch_transformers=pytorch_transformers.__main__:main", From e85123d398bfc2e58f6f6539e524ee9c4619ec0d Mon Sep 17 00:00:00 2001 From: Shijie Wu Date: Fri, 23 Aug 2019 20:27:52 -0400 Subject: [PATCH 008/119] Add custom tokenizer for zh and ja --- pytorch_transformers/tokenization_xlm.py | 71 +++++++++++++++++------- requirements.txt | 6 +- setup.py | 6 +- 3 files changed, 61 insertions(+), 22 deletions(-) diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index 8418a5d6f3bf53..a459dea9b91d57 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -23,7 +23,11 @@ import unicodedata from io import open +import jieba +import Mykytea import sacremoses as sm +from nltk.tokenize.stanford_segmenter import StanfordSegmenter +from pythainlp.tokenize import word_tokenize as th_word_tokenize from .tokenization_utils import PreTrainedTokenizer from .tokenization_bert import BasicTokenizer @@ -83,21 +87,6 @@ def get_pairs(word): prev_char = char return pairs -def text_standardize(text): - """ - fixes some issues the spacy tokenizer had on books corpus - also does some whitespace standardization - """ - text = text.replace('—', '-') - text = text.replace('–', '-') - text = text.replace('―', '-') - text = text.replace('…', '...') - text = text.replace('´', "'") - text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text) - text = re.sub(r'\s*\n\s*', ' \n ', text) - text = re.sub(r'[^\S\n]+', ' ', text) - return text.strip() - def lowercase_and_remove_accent(text): """ @@ -120,7 +109,7 @@ def replace_unicode_punct(text): Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl ''' text = text.replace(',', ',') - text = text.replace('。 *', '. ') + text = re.sub(r'。\s*', '. 
', text) text = text.replace('、', ',') text = text.replace('”', '"') text = text.replace('“', '"') @@ -220,6 +209,8 @@ def __init__(self, vocab_file, merges_file, unk_token="", bos_token="", self.lang_with_custom_tokenizer = set(['zh', 'th', 'ja']) # True for current supported model (v1.2.0), False for XLM-17 & 100 self.do_lowercase_and_remove_accent = True + self.ja_word_tokenizer = None + self.zh_word_tokenizer = None self.encoder = json.load(open(vocab_file, encoding="utf-8")) self.decoder = {v:k for k,v in self.encoder.items()} @@ -250,6 +241,33 @@ def moses_pipeline(self, text, lang): text = remove_non_printing_char(text) return text + def ja_tokenize(self, text): + if self.ja_word_tokenizer is None: + try: + self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~')) + except RuntimeError: + logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) with the following steps") + logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea") + logger.error("2. autoreconf -i") + logger.error("3. ./configure --prefix=$HOME/local") + logger.error("4. make && make install") + import sys; sys.exit() + return list(self.ja_word_tokenizer.getWS(text)) + + def zh_tokenize(self, text): + if self.zh_word_tokenizer is None: + try: + self.zh_word_tokenizer = StanfordSegmenter() + self.zh_word_tokenizer.default_config('zh') + except LookupError: + logger.error("Make sure you download stanford-segmenter (https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip) with the following steps") + logger.error("1. wget https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip -O /path/to/stanford-segmenter-2018-10-16.zip") + logger.error("2. cd /path/to && unzip stanford-segmenter-2018-10-16.zip") + logger.error("3. cd stanford-segmenter-2018-10-16 && cp stanford-segmenter-3.9.2.jar stanford-segmenter.jar") + logger.error("4. set env variable STANFORD_SEGMENTER=/path/to/stanford-segmenter-2018-10-16") + import sys; sys.exit() + return self.zh_word_tokenizer.segment(text) + @property def vocab_size(self): return len(self.encoder) @@ -299,7 +317,6 @@ def bpe(self, token): def _tokenize(self, text, lang='en'): """ Tokenize a string. 
""" - split_tokens = [] if self.do_lowercase_and_remove_accent: text = lowercase_and_remove_accent(text) if lang not in self.lang_with_custom_tokenizer: @@ -308,10 +325,24 @@ def _tokenize(self, text, lang='en'): if lang == 'ro': text = romanian_preprocessing(text) text = self.moses_tokenize(text, lang=lang) - for token in text: - split_tokens.extend([t for t in self.bpe(token).split(' ')]) + elif lang == 'th': + text = self.moses_pipeline(text, lang=lang) + text = th_word_tokenize(text) + elif lang == 'zh': + # text = self.zh_tokenize(text) + text = ' '.join(jieba.cut(text)) + text = self.moses_pipeline(text, lang=lang) + text = text.split() + elif lang == 'ja': + text = self.moses_pipeline(text, lang=lang) + text = self.ja_tokenize(text) else: - raise ValueError + raise ValueError('It should not reach here') + + split_tokens = [] + for token in text: + split_tokens.extend([t for t in self.bpe(token).split(' ')]) + return split_tokens def _convert_token_to_id(self, token): diff --git a/requirements.txt b/requirements.txt index 01dca79d23bdba..2e3f8ace51d40b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,8 @@ regex # For XLNet sentencepiece # For XLM -sacremoses \ No newline at end of file +sacremoses +pythainlp +kytea +nltk +jieba \ No newline at end of file diff --git a/setup.py b/setup.py index 297972226817f0..e37e948fb4d728 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,11 @@ 'tqdm', 'regex', 'sentencepiece', - 'sacremoses'], + 'sacremoses', + 'pythainlp', + 'kytea', + 'nltk', + 'jieba'], entry_points={ 'console_scripts': [ "pytorch_transformers=pytorch_transformers.__main__:main", From f1b018740c9355f0bcf0093fc993724eaa737445 Mon Sep 17 00:00:00 2001 From: Shijie Wu Date: Fri, 23 Aug 2019 20:33:01 -0400 Subject: [PATCH 009/119] Add use_lang_emb to config --- pytorch_transformers/modeling_xlm.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index 19800da2edf079..10be972ea53a7a 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -114,6 +114,7 @@ def __init__(self, causal=False, asm=False, n_langs=1, + use_lang_emb=True, max_position_embeddings=512, embed_init_std=2048 ** -0.5, layer_norm_eps=1e-12, @@ -157,6 +158,7 @@ def __init__(self, self.causal = causal self.asm = asm self.n_langs = n_langs + self.use_lang_emb = use_lang_emb self.layer_norm_eps = layer_norm_eps self.bos_index = bos_index self.eos_index = eos_index @@ -488,7 +490,7 @@ class XLMModel(XLMPreTrainedModel): """ ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output', - 'n_langs', 'n_words', 'dim', 'n_layers', 'n_heads', + 'n_langs', 'use_lang_emb', 'n_words', 'dim', 'n_layers', 'n_heads', 'hidden_dim', 'dropout', 'attention_dropout', 'asm', 'asm_cutoffs', 'asm_div_value'] @@ -507,6 +509,7 @@ def __init__(self, config): #, dico, is_encoder, with_output): # dictionary / languages self.n_langs = config.n_langs + self.use_lang_emb = config.use_lang_emb self.n_words = config.n_words self.eos_index = config.eos_index self.pad_index = config.pad_index @@ -529,7 +532,7 @@ def __init__(self, config): #, dico, is_encoder, with_output): self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim) if config.sinusoidal_embeddings: create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight) - if config.n_langs > 1: + if config.n_langs > 1 and config.use_lang_emb: self.lang_embeddings = 
nn.Embedding(self.n_langs, self.dim) self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index) self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps) @@ -628,7 +631,7 @@ def forward(self, input_ids, lengths=None, position_ids=None, langs=None, # embeddings tensor = self.embeddings(input_ids) tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor) - if langs is not None: + if langs is not None and self.use_lang_emb: tensor = tensor + self.lang_embeddings(langs) if token_type_ids is not None: tensor = tensor + self.embeddings(token_type_ids) From cb60ce59dddb37d70f3e3e7f7dd5b7aa9553aafe Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 26 Aug 2019 15:44:30 -0400 Subject: [PATCH 010/119] Added multiple AutoModel classes: AutoModelWithLMHead, AutoModelForQuestionAnswering and AutoModelForSequenceClassification --- pytorch_transformers/__init__.py | 3 +- pytorch_transformers/modeling_auto.py | 359 ++++++++++++++++++++++++-- 2 files changed, 344 insertions(+), 18 deletions(-) diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index 62e3b8c47b8018..424e22eb523e49 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -10,7 +10,8 @@ from .tokenization_utils import (PreTrainedTokenizer) -from .modeling_auto import (AutoConfig, AutoModel) +from .modeling_auto import (AutoConfig, AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, + AutoModelWithLMHead) from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction, diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py index 516107c40bbec4..9171438f97c56e 100644 --- a/pytorch_transformers/modeling_auto.py +++ b/pytorch_transformers/modeling_auto.py @@ -23,13 +23,13 @@ from torch.nn import CrossEntropyLoss, MSELoss from torch.nn.parameter import Parameter -from .modeling_bert import BertConfig, BertModel -from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel -from .modeling_gpt2 import GPT2Config, GPT2Model -from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel -from .modeling_xlnet import XLNetConfig, XLNetModel -from .modeling_xlm import XLMConfig, XLMModel -from .modeling_roberta import RobertaConfig, RobertaModel +from .modeling_bert import BertConfig, BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering +from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTLMHeadModel +from .modeling_gpt2 import GPT2Config, GPT2Model, GPT2LMHeadModel +from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel +from .modeling_xlnet import XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering +from .modeling_xlm import XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering +from .modeling_roberta import RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification from .modeling_utils import PreTrainedModel, SequenceSummary @@ -137,20 +137,20 @@ class AutoModel(object): when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class method. 
- The `from_pretrained()` method take care of returning the correct model class instance + The `from_pretrained()` method takes care of returning the correct model class instance using pattern matching on the `pretrained_model_name_or_path` string. The base model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaModel (RoBERTa model) - contains `bert`: BertModel (Bert model) - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) - contains `gpt2`: GPT2Model (OpenAI GPT-2 model) - contains `transfo-xl`: TransfoXLModel (Transformer-XL model) - contains `xlnet`: XLNetModel (XLNet model) - contains `xlm`: XLMModel (XLM model) - - contains `roberta`: RobertaModel (RoBERTa model) - This class cannot be instantiated using `__init__()` (throw an error). + This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError("AutoModel is designed to be instantiated " @@ -158,18 +158,18 @@ def __init__(self): @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - r""" Instantiate a one of the base model classes of the library + r""" Instantiates one of the base model classes of the library from a pre-trained model configuration. The base model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaModel (RoBERTa model) - contains `bert`: BertModel (Bert model) - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) - contains `gpt2`: GPT2Model (OpenAI GPT-2 model) - contains `transfo-xl`: TransfoXLModel (Transformer-XL model) - contains `xlnet`: XLNetModel (XLNet model) - contains `xlm`: XLMModel (XLM model) - - contains `roberta`: RobertaModel (RoBERTa model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` @@ -186,12 +186,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. **model_args**: (`optional`) Sequence: - All remaning positional arguments will be passed to the underlying model's __init__ function - **config**: an optional configuration for the model to use instead of an automatically loaded configuation. + All remaining positional arguments will be passed to the underlying model's __init__ function + **config**: an optional configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or - - the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory). - **state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded + - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). + **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. 
In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not @@ -200,7 +200,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. **output_loading_info**: (`optional`) boolean: - Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. **kwargs**: (`optional`) dict: Dictionary of key, values to update the configuration object after loading. Can be used to override selected configuration parameters. E.g. ``output_attention=True``. @@ -243,3 +243,328 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): raise ValueError("Unrecognized model identifier in {}. Should contains one of " "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " "'xlm', 'roberta'".format(pretrained_model_name_or_path)) + + + +class AutoModelWithLMHead(object): + r""" + :class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class + that will be instantiated as one of the language modeling model classes of the library + when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaForMaskedLM (RoBERTa model) + - contains `bert`: BertForMaskedLM (Bert model) + - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model) + - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model) + - contains `xlnet`: XLNetLMHeadModel (XLNet model) + - contains `xlm`: XLMWithLMHeadModel (XLM model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + def __init__(self): + raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated " + "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the language modeling model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. 
+ + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaForMaskedLM (RoBERTa model) + - contains `bert`: BertForMaskedLM (Bert model) + - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model) + - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model) + - contains `xlnet`: XLNetLMHeadModel (XLNet model) + - contains `xlm`: XLMWithLMHeadModel (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + **pretrained_model_name_or_path**: either: + - a string with the `shortcut name` of a pre-trained model to load from cache + or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). + - a path to a `directory` containing a configuration file saved + using the `save_pretrained(save_directory)` method. + - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). + In this case, ``from_tf`` should be set to True and a configuration object should be + provided as `config` argument. This loading option is slower than converting the TensorFlow + checkpoint in a PyTorch model using the provided conversion scripts and loading + the PyTorch model afterwards. + **model_args**: (`optional`) Sequence: + All remaining positional arguments will be passed to the underlying model's __init__ function + **config**: an optional configuration for the model to use instead of an automatically loaded configuration. + Configuration can be automatically loaded when: + - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or + - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). + **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded + from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not + a simpler option. + **cache_dir**: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + **output_loading_info**: (`optional`) boolean: + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. + **kwargs**: (`optional`) dict: + Dictionary of key, values to update the configuration object after loading. + Can be used to override selected configuration parameters. E.g. ``output_attention=True``. + + - If a configuration is provided with `config`, **kwargs will be directly passed + to the underlying model's __init__ method. + - If a configuration is not provided, **kwargs will be first passed to the pretrained + model configuration class loading function (`PretrainedConfig.from_pretrained`). + Each key of **kwargs that corresponds to a configuration attribute + will be used to override said attribute with the supplied **kwargs value. + Remaining keys that do not correspond to any configuration attribute will + be passed to the underlying model's __init__ function. 
+ + Examples:: + + model = AutoModelWithLMHead.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModelWithLMHead.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + if 'roberta' in pretrained_model_name_or_path: + return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'openai-gpt' in pretrained_model_name_or_path: + return OpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'gpt2' in pretrained_model_name_or_path: + return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'transfo-xl' in pretrained_model_name_or_path: + return TransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta'".format(pretrained_model_name_or_path)) + + +class AutoModelForSequenceClassification(object): + r""" + :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class + that will be instantiated as one of the sequence classification model classes of the library + when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) + - contains `bert`: BertForSequenceClassification (Bert model) + - contains `xlnet`: XLNetForSequenceClassification (XLNet model) + - contains `xlm`: XLMForSequenceClassification (XLM model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + def __init__(self): + raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated " + "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the sequence classification model classes of the library + from a pre-trained model configuration. 
+ + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) + - contains `bert`: BertForSequenceClassification (Bert model) + - contains `xlnet`: XLNetForSequenceClassification (XLNet model) + - contains `xlm`: XLMForSequenceClassification (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + **pretrained_model_name_or_path**: either: + - a string with the `shortcut name` of a pre-trained model to load from cache + or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). + - a path to a `directory` containing a configuration file saved + using the `save_pretrained(save_directory)` method. + - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). + In this case, ``from_tf`` should be set to True and a configuration object should be + provided as `config` argument. This loading option is slower than converting the TensorFlow + checkpoint in a PyTorch model using the provided conversion scripts and loading + the PyTorch model afterwards. + **model_args**: (`optional`) Sequence: + All remaining positional arguments will be passed to the underlying model's __init__ function + **config**: an optional configuration for the model to use instead of an automatically loaded configuration. + Configuration can be automatically loaded when: + - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or + - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). + **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded + from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not + a simpler option. + **cache_dir**: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + **output_loading_info**: (`optional`) boolean: + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. + **kwargs**: (`optional`) dict: + Dictionary of key, values to update the configuration object after loading. + Can be used to override selected configuration parameters. E.g. ``output_attention=True``. + + - If a configuration is provided with `config`, **kwargs will be directly passed + to the underlying model's __init__ method. + - If a configuration is not provided, **kwargs will be first passed to the pretrained + model configuration class loading function (`PretrainedConfig.from_pretrained`). + Each key of **kwargs that corresponds to a configuration attribute + will be used to override said attribute with the supplied **kwargs value. + Remaining keys that do not correspond to any configuration attribute will + be passed to the underlying model's __init__ function. 
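Beyond loading, a short sketch of how the returned sequence classification model is typically called; a BERT checkpoint is assumed, and the loss-first tuple ordering follows the library's convention when labels are supplied:

```
import torch
from pytorch_transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
model.train()  # from_pretrained() returns the model in evaluation mode

input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)])
labels = torch.tensor([1])
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
```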
+ + Examples:: + + model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + if 'roberta' in pretrained_model_name_or_path: + return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path)) + + +class AutoModelForQuestionAnswering(object): + r""" + :class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class + that will be instantiated as one of the question answering model classes of the library + when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `bert`: BertForQuestionAnswering (Bert model) + - contains `xlnet`: XLNetForQuestionAnswering (XLNet model) + - contains `xlm`: XLMForQuestionAnswering (XLM model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + def __init__(self): + raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated " + "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the question answering model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. 
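When no pattern matches, the ``from_pretrained`` methods above fail fast with a ``ValueError`` before any download is attempted. A quick sketch with a deliberately made-up identifier:

```
from pytorch_transformers import AutoModelForSequenceClassification

try:
    AutoModelForSequenceClassification.from_pretrained('my-custom-model')
except ValueError as err:
    print(err)  # "Unrecognized model identifier in my-custom-model. ..."
```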
+ + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `bert`: BertForQuestionAnswering (Bert model) + - contains `xlnet`: XLNetForQuestionAnswering (XLNet model) + - contains `xlm`: XLMForQuestionAnswering (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + **pretrained_model_name_or_path**: either: + - a string with the `shortcut name` of a pre-trained model to load from cache + or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). + - a path to a `directory` containing a configuration file saved + using the `save_pretrained(save_directory)` method. + - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). + In this case, ``from_tf`` should be set to True and a configuration object should be + provided as `config` argument. This loading option is slower than converting the TensorFlow + checkpoint in a PyTorch model using the provided conversion scripts and loading + the PyTorch model afterwards. + **model_args**: (`optional`) Sequence: + All remaining positional arguments will be passed to the underlying model's __init__ function + **config**: an optional configuration for the model to use instead of an automatically loaded configuration. + Configuration can be automatically loaded when: + - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or + - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). + **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded + from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not + a simpler option. + **cache_dir**: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + **output_loading_info**: (`optional`) boolean: + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. + **kwargs**: (`optional`) dict: + Dictionary of key, values to update the configuration object after loading. + Can be used to override selected configuration parameters. E.g. ``output_attention=True``. + + - If a configuration is provided with `config`, **kwargs will be directly passed + to the underlying model's __init__ method. + - If a configuration is not provided, **kwargs will be first passed to the pretrained + model configuration class loading function (`PretrainedConfig.from_pretrained`). + Each key of **kwargs that corresponds to a configuration attribute + will be used to override said attribute with the supplied **kwargs value. + Remaining keys that do not correspond to any configuration attribute will + be passed to the underlying model's __init__ function. + + Examples:: + + model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModelForQuestionAnswering.from_pretrained('./test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + if 'bert' in pretrained_model_name_or_path: + return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path)) From f68339639a554e2a08bc3eb93bea9fffa095033a Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 26 Aug 2019 16:02:23 -0400 Subject: [PATCH 011/119] Tests for added AutoModels --- .../tests/modeling_auto_test.py | 42 ++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/tests/modeling_auto_test.py b/pytorch_transformers/tests/modeling_auto_test.py index d0c830abc7826e..09d09b28fc287e 100644 --- a/pytorch_transformers/tests/modeling_auto_test.py +++ b/pytorch_transformers/tests/modeling_auto_test.py @@ -21,7 +21,11 @@ import pytest import logging -from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel +from pytorch_transformers import (AutoConfig, BertConfig, + AutoModel, BertModel, + AutoModelWithLMHead, BertForMaskedLM, + AutoModelForSequenceClassification, BertForSequenceClassification, + AutoModelForQuestionAnswering, BertForQuestionAnswering) from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) @@ -42,6 +46,42 @@ def test_model_from_pretrained(self): for value in loading_info.values(): self.assertEqual(len(value), 0) + def test_lmhead_model_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModelWithLMHead.from_pretrained(model_name) + model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForMaskedLM) + + def test_sequence_classification_model_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModelForSequenceClassification.from_pretrained(model_name) + model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForSequenceClassification) + + def test_question_answering_model_from_pretrained(self): + 
logging.basicConfig(level=logging.INFO) + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModelForQuestionAnswering.from_pretrained(model_name) + model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForQuestionAnswering) + if __name__ == "__main__": unittest.main() From 84a3a9689dcd29274c0ab0054bbbbba5fd4342d2 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 26 Aug 2019 16:08:43 -0400 Subject: [PATCH 012/119] Pytorch Hub & AutoModels --- hubconf.py | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/hubconf.py b/hubconf.py index f8336207802f3b..0326d62b25aac5 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,30 +1,5 @@ dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] -from hubconfs.bert_hubconf import ( - bertTokenizer, - bertModel, - bertForNextSentencePrediction, - bertForPreTraining, - bertForMaskedLM, - bertForSequenceClassification, - bertForMultipleChoice, - bertForQuestionAnswering, - bertForTokenClassification -) -from hubconfs.gpt_hubconf import ( - openAIGPTTokenizer, - openAIGPTModel, - openAIGPTLMHeadModel, - openAIGPTDoubleHeadsModel -) -from hubconfs.gpt2_hubconf import ( - gpt2Tokenizer, - gpt2Model, - gpt2LMHeadModel, - gpt2DoubleHeadsModel -) -from hubconfs.transformer_xl_hubconf import ( - transformerXLTokenizer, - transformerXLModel, - transformerXLLMHeadModel +from pytorch_transformers import ( + AutoTokenizer, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering ) From a175a9dc0188a367400c2121391fa3abf536748e Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 27 Aug 2019 14:05:59 +0200 Subject: [PATCH 013/119] add kwargs to base encode function --- pytorch_transformers/tokenization_utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 4fef0e34fb0754..1d05441593562b 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -563,7 +563,7 @@ def _convert_token_to_id_with_added_voc(self, token): def _convert_token_to_id(self, token): raise NotImplementedError - def encode(self, text, text_pair=None, add_special_tokens=False): + def encode(self, text, text_pair=None, add_special_tokens=False, **kwargs): """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. @@ -574,15 +574,16 @@ def encode(self, text, text_pair=None, add_special_tokens=False): text_pair: Optional second sequence to be encoded. add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative to their model. 
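The new tests above lean on the ``output_loading_info=True`` flag; a brief sketch of the extra return value (the exact key names come from the library's loading code and are assumed here rather than guaranteed):

```
from pytorch_transformers import AutoModelWithLMHead

model, loading_info = AutoModelWithLMHead.from_pretrained('bert-base-uncased',
                                                          output_loading_info=True)
# Typically exposes 'missing_keys', 'unexpected_keys' and 'error_msgs'
for key, value in loading_info.items():
    print(key, value)
```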
+ **kwargs: passed to the `self.tokenize()` method """ if text_pair is None: if add_special_tokens: - return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text))) + return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text, **kwargs))) else: - return self.convert_tokens_to_ids(self.tokenize(text)) + return self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) - first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text)] - second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair)] + first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)] + second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)] if add_special_tokens: return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens) From 447afe9cdff782e3bdf15049cc1a6d91d75cff0e Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 27 Aug 2019 14:42:03 +0200 Subject: [PATCH 014/119] updating docstring for AutoModel --- pytorch_transformers/modeling_auto.py | 332 +++++++++++----------- pytorch_transformers/modeling_utils.py | 11 + pytorch_transformers/tokenization_auto.py | 28 +- 3 files changed, 203 insertions(+), 168 deletions(-) diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py index 9171438f97c56e..73d9c8e0d41ae9 100644 --- a/pytorch_transformers/modeling_auto.py +++ b/pytorch_transformers/modeling_auto.py @@ -31,7 +31,7 @@ from .modeling_xlm import XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering from .modeling_roberta import RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification -from .modeling_utils import PreTrainedModel, SequenceSummary +from .modeling_utils import PreTrainedModel, SequenceSummary, add_start_docstrings logger = logging.getLogger(__name__) @@ -76,26 +76,32 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - contains `roberta`: RobertaConfig (RoBERTa model) Params: - **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model configuration to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a saved configuration `file`. - **cache_dir**: (`optional`) string: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. + + cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. - **return_unused_kwargs**: (`optional`) bool: + + kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. + + - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. 
+ - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + return_unused_kwargs: (`optional`) bool: + - If False, then this function returns just the final configuration object. - - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` - is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: - ie the part of kwargs which has not been used to update `config` and is otherwise ignored. - **kwargs**: (`optional`) dict: - Dictionary of key/value pairs with which to update the configuration object after loading. - - The values in kwargs of any keys which are configuration attributes will be used - to override the loaded values. - - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled - by the `return_unused_kwargs` keyword parameter. + - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. Examples:: @@ -161,7 +167,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the base model classes of the library from a pre-trained model configuration. - The base model class to instantiate is selected as the first pattern matching + The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): - contains `roberta`: RobertaModel (RoBERTa model) - contains `bert`: BertModel (Bert model) @@ -175,44 +181,46 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): To train the model, you should first set it back in training mode with `model.train()` Params: - **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). - In this case, ``from_tf`` should be set to True and a configuration object should be - provided as `config` argument. This loading option is slower than converting the TensorFlow - checkpoint in a PyTorch model using the provided conversion scripts and loading - the PyTorch model afterwards. - **model_args**: (`optional`) Sequence: - All remaining positional arguments will be passed to the underlying model's __init__ function - **config**: an optional configuration for the model to use instead of an automatically loaded configuration. 
- Configuration can be automatically loaded when: - - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or - - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). - **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded - from saved weights file. + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not - a simpler option. - **cache_dir**: (`optional`) string: + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. - **output_loading_info**: (`optional`) boolean: - Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. - **kwargs**: (`optional`) dict: - Dictionary of key, values to update the configuration object after loading. - Can be used to override selected configuration parameters. E.g. ``output_attention=True``. - - - If a configuration is provided with `config`, **kwargs will be directly passed - to the underlying model's __init__ method. - - If a configuration is not provided, **kwargs will be first passed to the pretrained - model configuration class loading function (`PretrainedConfig.from_pretrained`). - Each key of **kwargs that corresponds to a configuration attribute - will be used to override said attribute with the supplied **kwargs value. 
- Remaining keys that do not correspond to any configuration attribute will - be passed to the underlying model's __init__ function. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -294,44 +302,46 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): To train the model, you should first set it back in training mode with `model.train()` Params: - **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). - In this case, ``from_tf`` should be set to True and a configuration object should be - provided as `config` argument. This loading option is slower than converting the TensorFlow - checkpoint in a PyTorch model using the provided conversion scripts and loading - the PyTorch model afterwards. - **model_args**: (`optional`) Sequence: - All remaining positional arguments will be passed to the underlying model's __init__ function - **config**: an optional configuration for the model to use instead of an automatically loaded configuration. - Configuration can be automatically loaded when: - - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or - - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). - **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded - from saved weights file. + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. 
+ - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not - a simpler option. - **cache_dir**: (`optional`) string: + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. - **output_loading_info**: (`optional`) boolean: - Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. - **kwargs**: (`optional`) dict: - Dictionary of key, values to update the configuration object after loading. - Can be used to override selected configuration parameters. E.g. ``output_attention=True``. - - - If a configuration is provided with `config`, **kwargs will be directly passed - to the underlying model's __init__ method. - - If a configuration is not provided, **kwargs will be first passed to the pretrained - model configuration class loading function (`PretrainedConfig.from_pretrained`). - Each key of **kwargs that corresponds to a configuration attribute - will be used to override said attribute with the supplied **kwargs value. - Remaining keys that do not correspond to any configuration attribute will - be passed to the underlying model's __init__ function. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. 
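The download controls documented above can be combined in a single call; a small sketch in which the proxy address is just a placeholder:

```
from pytorch_transformers import AutoModelWithLMHead

model = AutoModelWithLMHead.from_pretrained(
    'bert-base-uncased',
    force_download=True,                          # ignore any cached copy
    proxies={'https': 'http://10.10.1.10:1080'},  # placeholder proxy endpoint
)
```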
+ + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -406,44 +416,46 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): To train the model, you should first set it back in training mode with `model.train()` Params: - **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). - In this case, ``from_tf`` should be set to True and a configuration object should be - provided as `config` argument. This loading option is slower than converting the TensorFlow - checkpoint in a PyTorch model using the provided conversion scripts and loading - the PyTorch model afterwards. - **model_args**: (`optional`) Sequence: - All remaining positional arguments will be passed to the underlying model's __init__ function - **config**: an optional configuration for the model to use instead of an automatically loaded configuration. - Configuration can be automatically loaded when: - - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or - - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). - **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded - from saved weights file. + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. 
+ + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not - a simpler option. - **cache_dir**: (`optional`) string: + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. - **output_loading_info**: (`optional`) boolean: - Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. - **kwargs**: (`optional`) dict: - Dictionary of key, values to update the configuration object after loading. - Can be used to override selected configuration parameters. E.g. ``output_attention=True``. - - - If a configuration is provided with `config`, **kwargs will be directly passed - to the underlying model's __init__ method. - - If a configuration is not provided, **kwargs will be first passed to the pretrained - model configuration class loading function (`PretrainedConfig.from_pretrained`). - Each key of **kwargs that corresponds to a configuration attribute - will be used to override said attribute with the supplied **kwargs value. - Remaining keys that do not correspond to any configuration attribute will - be passed to the underlying model's __init__ function. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). 
Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -509,44 +521,46 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): To train the model, you should first set it back in training mode with `model.train()` Params: - **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). - In this case, ``from_tf`` should be set to True and a configuration object should be - provided as `config` argument. This loading option is slower than converting the TensorFlow - checkpoint in a PyTorch model using the provided conversion scripts and loading - the PyTorch model afterwards. - **model_args**: (`optional`) Sequence: - All remaining positional arguments will be passed to the underlying model's __init__ function - **config**: an optional configuration for the model to use instead of an automatically loaded configuration. - Configuration can be automatically loaded when: - - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or - - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). - **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded - from saved weights file. + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. 
Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not - a simpler option. - **cache_dir**: (`optional`) string: + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. - **output_loading_info**: (`optional`) boolean: - Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. - **kwargs**: (`optional`) dict: - Dictionary of key, values to update the configuration object after loading. - Can be used to override selected configuration parameters. E.g. ``output_attention=True``. - - - If a configuration is provided with `config`, **kwargs will be directly passed - to the underlying model's __init__ method. - - If a configuration is not provided, **kwargs will be first passed to the pretrained - model configuration class loading function (`PretrainedConfig.from_pretrained`). - Each key of **kwargs that corresponds to a configuration attribute - will be used to override said attribute with the supplied **kwargs value. - Remaining keys that do not correspond to any configuration attribute will - be passed to the underlying model's __init__ function. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). 
Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 468d240fbc86d4..0d4fce67f0c3c3 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -59,6 +59,12 @@ def docstring_decorator(fn): fn.__doc__ = ''.join(docstr) + fn.__doc__ return fn return docstring_decorator + + def add_end_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + ''.join(docstr) + return fn + return docstring_decorator else: # Not possible to update class docstrings on python2 def add_start_docstrings(*docstr): @@ -66,6 +72,11 @@ def docstring_decorator(fn): return fn return docstring_decorator + def add_end_docstrings(*docstr): + def docstring_decorator(fn): + return fn + return docstring_decorator + class PretrainedConfig(object): r""" Base class for all configuration classes. diff --git a/pytorch_transformers/tokenization_auto.py b/pytorch_transformers/tokenization_auto.py index b4b63369529e78..576dee70ece4dc 100644 --- a/pytorch_transformers/tokenization_auto.py +++ b/pytorch_transformers/tokenization_auto.py @@ -69,15 +69,25 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): - contains `roberta`: RobertaTokenizer (XLM model) Params: - **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model configuration to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a saved configuration `file`. - **cache_dir**: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. + - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the vocabulary files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. + + kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. 
Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details. Examples:: From ca4baf8ca1f09e379c5e396c3332ff570f4422fc Mon Sep 17 00:00:00 2001 From: Shijie Wu Date: Tue, 27 Aug 2019 20:03:18 -0400 Subject: [PATCH 015/119] Match order of casing in OSS XLM; Improve document; Clean up dependency --- pytorch_transformers/tokenization_xlm.py | 102 +++++++++++++++-------- requirements.txt | 6 +- setup.py | 6 +- 3 files changed, 71 insertions(+), 43 deletions(-) diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index a459dea9b91d57..71bf119387328c 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -20,14 +20,11 @@ import logging import os import re +import sys import unicodedata from io import open -import jieba -import Mykytea import sacremoses as sm -from nltk.tokenize.stanford_segmenter import StanfordSegmenter -from pythainlp.tokenize import word_tokenize as th_word_tokenize from .tokenization_utils import PreTrainedTokenizer from .tokenization_bert import BasicTokenizer @@ -93,6 +90,7 @@ def lowercase_and_remove_accent(text): Lowercase and strips accents from a piece of text based on https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py """ + text = ' '.join(text) text = text.lower() text = unicodedata.normalize("NFD", text) output = [] @@ -101,7 +99,7 @@ def lowercase_and_remove_accent(text): if cat == "Mn": continue output.append(char) - return "".join(output).lower() + return "".join(output).lower().split(' ') def replace_unicode_punct(text): @@ -176,13 +174,13 @@ def romanian_preprocessing(text): class XLMTokenizer(PreTrainedTokenizer): """ - BPE tokenizer for XLM, adapted from OpenAI BPE tokenizer. Peculiarities: + BPE tokenizer for XLM - - lower case all inputs + - Moses preprocessing & tokenization for most supported languages - - uses `SpaCy tokenizer `_ and \ - `ftfy `_ for pre-BPE tokenization if they are installed, \ - fallback to BERT's BasicTokenizer if not. + - Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP) + + - (optionally) lower case & normalize all inputs text - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ (ex: "__classify__") to a vocabulary. @@ -244,30 +242,18 @@ def moses_pipeline(self, text, lang): def ja_tokenize(self, text): if self.ja_word_tokenizer is None: try: + import Mykytea self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~')) - except RuntimeError: - logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) with the following steps") + except: + logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps") logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea") logger.error("2. autoreconf -i") logger.error("3. ./configure --prefix=$HOME/local") logger.error("4. make && make install") + logger.error("5. 
pip install kytea") import sys; sys.exit() return list(self.ja_word_tokenizer.getWS(text)) - def zh_tokenize(self, text): - if self.zh_word_tokenizer is None: - try: - self.zh_word_tokenizer = StanfordSegmenter() - self.zh_word_tokenizer.default_config('zh') - except LookupError: - logger.error("Make sure you download stanford-segmenter (https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip) with the following steps") - logger.error("1. wget https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip -O /path/to/stanford-segmenter-2018-10-16.zip") - logger.error("2. cd /path/to && unzip stanford-segmenter-2018-10-16.zip") - logger.error("3. cd stanford-segmenter-2018-10-16 && cp stanford-segmenter-3.9.2.jar stanford-segmenter.jar") - logger.error("4. set env variable STANFORD_SEGMENTER=/path/to/stanford-segmenter-2018-10-16") - import sys; sys.exit() - return self.zh_word_tokenizer.segment(text) - @property def vocab_size(self): return len(self.encoder) @@ -315,11 +301,44 @@ def bpe(self, token): self.cache[token] = word return word - def _tokenize(self, text, lang='en'): - """ Tokenize a string. """ - if self.do_lowercase_and_remove_accent: - text = lowercase_and_remove_accent(text) - if lang not in self.lang_with_custom_tokenizer: + def _tokenize(self, text, lang='en', bypass_tokenizer=False): + """ + Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific tokenizerself. Otherwise, we use Moses. + + Details of tokenization: + - [sacremoses](https://github.com/alvations/sacremoses): port of Moses + - Install with `pip install sacremoses` + - [pythainlp](https://github.com/PyThaiNLP/pythainlp): Thai tokenizer + - Install with `pip install pythainlp` + - [kytea](https://github.com/chezou/Mykytea-python): Japanese tokenizer, wrapper of [KyTea](https://github.com/neubig/kytea) + - Install with the following steps: + ``` + git clone git@github.com:neubig/kytea.git && cd kytea + autoreconf -i + ./configure --prefix=$HOME/local + make && make install + pip install kytea + ``` + - [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer * + - Install with `pip install jieba` + + \* The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip). + However, the wrapper (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated. + Jieba is a lot faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine + if you fine-tune the model with Chinese supervisionself. If you want the same exact behaviour, use the original XLM + [preprocessing script](https://github.com/facebookresearch/XLM/tree/master/tools) to tokenize the sentence externally, + and set `bypass_tokenizer=True` to bypass the tokenizer. + + Args: + - lang: ISO language code (default = 'en') (string). Languages should belong of the model supported languages. However, we don't enforce it. + - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE. + + Returns: + List of tokens. 
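A short usage sketch of the two tokenization paths described above; the checkpoint name is illustrative and it is assumed that the base ``tokenize()`` forwards keyword arguments to ``_tokenize()``:

```
from pytorch_transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024')

# Moses preprocessing + BPE for a supported language
tokens = tokenizer.tokenize("Ceci est une phrase.", lang='fr')

# Text already tokenized externally (e.g. with the original XLM tools):
# skip the language-specific step and only apply BPE.
bpe_only = tokenizer.tokenize("ceci est une phrase .", bypass_tokenizer=True)
```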
+ """ + if bypass_tokenizer: + text = text.split() + elif lang not in self.lang_with_custom_tokenizer: text = self.moses_pipeline(text, lang=lang) # TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step if lang == 'ro': @@ -327,9 +346,22 @@ def _tokenize(self, text, lang='en'): text = self.moses_tokenize(text, lang=lang) elif lang == 'th': text = self.moses_pipeline(text, lang=lang) + try: + if 'pythainlp' not in sys.modules: + from pythainlp.tokenize import word_tokenize as th_word_tokenize + except: + logger.error("Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps") + logger.error("1. pip install pythainlp") + import sys; sys.exit() text = th_word_tokenize(text) elif lang == 'zh': - # text = self.zh_tokenize(text) + try: + if 'jieba' not in sys.modules: + import jieba + except: + logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps") + logger.error("1. pip install jieba") + import sys; sys.exit() text = ' '.join(jieba.cut(text)) text = self.moses_pipeline(text, lang=lang) text = text.split() @@ -339,9 +371,13 @@ def _tokenize(self, text, lang='en'): else: raise ValueError('It should not reach here') + if self.do_lowercase_and_remove_accent and not bypass_tokenizer: + text = lowercase_and_remove_accent(text) + split_tokens = [] for token in text: - split_tokens.extend([t for t in self.bpe(token).split(' ')]) + if token: + split_tokens.extend([t for t in self.bpe(token).split(' ')]) return split_tokens diff --git a/requirements.txt b/requirements.txt index 2e3f8ace51d40b..01dca79d23bdba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,8 +11,4 @@ regex # For XLNet sentencepiece # For XLM -sacremoses -pythainlp -kytea -nltk -jieba \ No newline at end of file +sacremoses \ No newline at end of file diff --git a/setup.py b/setup.py index e37e948fb4d728..297972226817f0 100644 --- a/setup.py +++ b/setup.py @@ -56,11 +56,7 @@ 'tqdm', 'regex', 'sentencepiece', - 'sacremoses', - 'pythainlp', - 'kytea', - 'nltk', - 'jieba'], + 'sacremoses'], entry_points={ 'console_scripts': [ "pytorch_transformers=pytorch_transformers.__main__:main", From e7fba4bef55c8c667c318549860c0826ccd164dd Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 29 Aug 2019 12:14:29 -0400 Subject: [PATCH 016/119] Documentation auto-deploy --- .circleci/config.yml | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7f316b0b3a4208..30555731ef01a3 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -26,9 +26,27 @@ jobs: - run: sudo pip install pytest codecov pytest-cov - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov - run: codecov + deploy_doc: + working_directory: ~/pytorch-transformers + docker: + - image: circleci/python:3.5 + steps: + - add_ssh_keys: + fingerprints: + - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71" + - checkout + - run: sudo pip install -r docs/requirements.txt + - run: sudo pip install -r requirements.txt + - run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir +workflow_filters: &workflow_filters + filters: + branches: + only: + - master workflows: - version: 2 - build_and_test: - jobs: - - build_py3 - - build_py2 \ No newline at end of file + version: 2 + build_and_test: + jobs: + - build_py3 + - build_py2 + - deploy_doc: *workflow_filters \ No newline at end of file From 
07e21307b6b16350425ea08790c7e20c081a3b3f Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 29 Aug 2019 13:44:50 -0700 Subject: [PATCH 017/119] fix adding special tokens --- pytorch_transformers/tokenization_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 9d814f704f7546..588ff8ab6696c9 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -676,7 +676,7 @@ def all_special_tokens(self): all_toks = [] set_attr = self.special_tokens_map for attr_value in set_attr.values(): - all_toks = all_toks + (attr_value if isinstance(attr_value, (list, tuple)) else [attr_value]) + all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) all_toks = list(set(all_toks)) return all_toks From 6060b2f89b4ba3ad6d2ddb332835a95962c4bf2c Mon Sep 17 00:00:00 2001 From: ziliwang Date: Fri, 30 Aug 2019 12:13:47 +0800 Subject: [PATCH 018/119] fix: hard coding for max number fp16 max number is 65504, the original 1e30 will cause Nan in fp16 --- pytorch_transformers/modeling_xlnet.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py index ca2d63f6b5216a..ebf8c1fd63cbdd 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/pytorch_transformers/modeling_xlnet.py @@ -418,7 +418,10 @@ def rel_attn_core(self, q_head, k_head_h, v_head_h, k_head_r, seg_mat=None, attn attn_score = (ac + bd + ef) * self.scale if attn_mask is not None: # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask - attn_score = attn_score - 1e30 * attn_mask + if attn_mask.dtype == torch.float16: + attn_score = attn_score - 65500 * attn_mask + else: + attn_score = attn_score - 1e30 * attn_mask # attention probability attn_prob = F.softmax(attn_score, dim=1) From c8731b958330fd8e8f59cb6beeb626218cf8afe2 Mon Sep 17 00:00:00 2001 From: jamin Date: Fri, 30 Aug 2019 13:54:00 +0900 Subject: [PATCH 019/119] update apex fp16 implementation --- .../lm_finetuning/finetune_on_pregenerated.py | 93 +++++++++++-------- 1 file changed, 54 insertions(+), 39 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 55933068d53fcb..c9fd7b47ba23b4 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -1,22 +1,20 @@ -from argparse import ArgumentParser -from pathlib import Path -import os -import torch -import logging import json +import logging import random -import numpy as np +from argparse import ArgumentParser from collections import namedtuple +from pathlib import Path from tempfile import TemporaryDirectory +import numpy as np +import torch from torch.utils.data import DataLoader, Dataset, RandomSampler from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm -from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME from pytorch_transformers.modeling_bert import BertForPreTraining -from pytorch_transformers.tokenization_bert import BertTokenizer from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule +from pytorch_transformers.tokenization_bert import BertTokenizer InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next") @@ -72,16 +70,16 @@ def __init__(self, training_path, epoch, tokenizer, num_data_epochs, 
reduce_memo if reduce_memory: self.temp_dir = TemporaryDirectory() self.working_dir = Path(self.temp_dir.name) - input_ids = np.memmap(filename=self.working_dir/'input_ids.memmap', + input_ids = np.memmap(filename=self.working_dir / 'input_ids.memmap', mode='w+', dtype=np.int32, shape=(num_samples, seq_len)) - input_masks = np.memmap(filename=self.working_dir/'input_masks.memmap', + input_masks = np.memmap(filename=self.working_dir / 'input_masks.memmap', shape=(num_samples, seq_len), mode='w+', dtype=np.bool) - segment_ids = np.memmap(filename=self.working_dir/'segment_ids.memmap', + segment_ids = np.memmap(filename=self.working_dir / 'segment_ids.memmap', shape=(num_samples, seq_len), mode='w+', dtype=np.bool) - lm_label_ids = np.memmap(filename=self.working_dir/'lm_label_ids.memmap', + lm_label_ids = np.memmap(filename=self.working_dir / 'lm_label_ids.memmap', shape=(num_samples, seq_len), mode='w+', dtype=np.int32) lm_label_ids[:] = -1 - is_nexts = np.memmap(filename=self.working_dir/'is_nexts.memmap', + is_nexts = np.memmap(filename=self.working_dir / 'is_nexts.memmap', shape=(num_samples,), mode='w+', dtype=np.bool) else: input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32) @@ -125,7 +123,8 @@ def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) - parser.add_argument("--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " + parser.add_argument("--bert_model", type=str, required=True, + help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--do_lower_case", action="store_true") parser.add_argument("--reduce_memory", action="store_true", @@ -153,14 +152,14 @@ def main(): parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" - "0 (default value): dynamic loss scaling.\n" - "Positive power of 2: static loss scaling value.\n") - parser.add_argument("--warmup_steps", - default=0, + "0 (default value): dynamic loss scaling.\n" + "Positive power of 2: static loss scaling value.\n") + parser.add_argument("--warmup_steps", + default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument("--adam_epsilon", - default=1e-8, + parser.add_argument("--adam_epsilon", + default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", @@ -207,7 +206,7 @@ def main(): if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( - args.gradient_accumulation_steps)) + args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps @@ -235,8 +234,9 @@ def main(): # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) - if args.fp16: - model.half() + # We don't need to manually call model.half() following Apex's recommend + # if args.fp16: + # model.half() model.to(device) if args.local_rank != -1: try: @@ -257,25 +257,36 @@ def main(): {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) + scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, + t_total=num_train_optimization_steps) + if args.fp16: try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam + # from apex.optimizers import FP16_Optimizer + # from apex.optimizers import FusedAdam + from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - else: - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) + # This below line of code is the main upgrade of Apex Fp16 implementation. I chose opt_leve="01" + # because it's recommended for typical use by Apex. We can make it configured + model, optimizer = amp.initialize(model, optimizer, opt_level="O1") + + # We don't need to use FP16_Optimizer wrapping over FusedAdam as well. 
Now Apex supports all Pytorch Optimizer + + # optimizer = FusedAdam(optimizer_grouped_parameters, + # lr=args.learning_rate, + # bias_correction=False, + # max_grad_norm=1.0) + # if args.loss_scale == 0: + # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + # else: + # optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + # else: + # optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) + # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) global_step = 0 logging.info("***** Running training *****") @@ -300,11 +311,14 @@ def main(): outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) loss = outputs[0] if n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu. + loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: - optimizer.backward(loss) + # I depricate FP16_Optimizer's backward func and replace as Apex document + # optimizer.backward(loss) + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() else: loss.backward() tr_loss += loss.item() @@ -322,7 +336,8 @@ def main(): # Save a trained model if args.local_rank == -1 or torch.distributed.get_rank() == 0: logging.info("** ** * Saving fine-tuned model ** ** * ") - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = model.module if hasattr(model, + 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) From 2fb9a934b4a7ba46f58db9785c50892ef9e6e5c3 Mon Sep 17 00:00:00 2001 From: jamin Date: Fri, 30 Aug 2019 14:05:28 +0900 Subject: [PATCH 020/119] re-format --- .../lm_finetuning/finetune_on_pregenerated.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index c9fd7b47ba23b4..10721c7dcd7344 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -1,20 +1,22 @@ -import json +from argparse import ArgumentParser +from pathlib import Path +import os +import torch import logging +import json import random -from argparse import ArgumentParser +import numpy as np from collections import namedtuple -from pathlib import Path from tempfile import TemporaryDirectory -import numpy as np -import torch from torch.utils.data import DataLoader, Dataset, RandomSampler from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm +from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME from pytorch_transformers.modeling_bert import BertForPreTraining -from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule from pytorch_transformers.tokenization_bert import BertTokenizer +from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next") @@ -70,16 +72,16 @@ def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memo if reduce_memory: self.temp_dir = TemporaryDirectory() self.working_dir = Path(self.temp_dir.name) - input_ids = np.memmap(filename=self.working_dir / 'input_ids.memmap', + input_ids = 
np.memmap(filename=self.working_dir/'input_ids.memmap', mode='w+', dtype=np.int32, shape=(num_samples, seq_len)) - input_masks = np.memmap(filename=self.working_dir / 'input_masks.memmap', + input_masks = np.memmap(filename=self.working_dir/'input_masks.memmap', shape=(num_samples, seq_len), mode='w+', dtype=np.bool) - segment_ids = np.memmap(filename=self.working_dir / 'segment_ids.memmap', + segment_ids = np.memmap(filename=self.working_dir/'segment_ids.memmap', shape=(num_samples, seq_len), mode='w+', dtype=np.bool) - lm_label_ids = np.memmap(filename=self.working_dir / 'lm_label_ids.memmap', + lm_label_ids = np.memmap(filename=self.working_dir/'lm_label_ids.memmap', shape=(num_samples, seq_len), mode='w+', dtype=np.int32) lm_label_ids[:] = -1 - is_nexts = np.memmap(filename=self.working_dir / 'is_nexts.memmap', + is_nexts = np.memmap(filename=self.working_dir/'is_nexts.memmap', shape=(num_samples,), mode='w+', dtype=np.bool) else: input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32) @@ -123,8 +125,7 @@ def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) - parser.add_argument("--bert_model", type=str, required=True, - help="Bert pre-trained model selected in the list: bert-base-uncased, " + parser.add_argument("--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--do_lower_case", action="store_true") parser.add_argument("--reduce_memory", action="store_true", @@ -152,14 +153,14 @@ def main(): parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" - "0 (default value): dynamic loss scaling.\n" - "Positive power of 2: static loss scaling value.\n") - parser.add_argument("--warmup_steps", - default=0, + "0 (default value): dynamic loss scaling.\n" + "Positive power of 2: static loss scaling value.\n") + parser.add_argument("--warmup_steps", + default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument("--adam_epsilon", - default=1e-8, + parser.add_argument("--adam_epsilon", + default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", @@ -206,7 +207,7 @@ def main(): if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( - args.gradient_accumulation_steps)) + args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps @@ -311,7 +312,7 @@ def main(): outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) loss = outputs[0] if n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu. + loss = loss.mean() # mean() to average on multi-gpu. 
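# A minimal sketch of the Apex AMP training pattern that the two fp16 patches above
# switch to, replacing the deprecated FP16_Optimizer/FusedAdam wrappers. This is an
# illustrative sketch, not code from the diff: `dataloader`, `scheduler` and
# `max_grad_norm` are placeholder names, and `opt_level="O1"` is the level chosen in
# the patch.
import torch
from apex import amp  # assumes NVIDIA Apex is installed


def amp_training_sketch(model, optimizer, scheduler, dataloader, max_grad_norm=1.0):
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    for batch in dataloader:
        loss = model(**batch)[0]
        # Scale the loss so fp16 gradients do not underflow, then backpropagate.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        # Clip on the fp32 master parameters that AMP maintains internally.
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()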
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: @@ -336,8 +337,7 @@ def main(): # Save a trained model if args.local_rank == -1 or torch.distributed.get_rank() == 0: logging.info("** ** * Saving fine-tuned model ** ** * ") - model_to_save = model.module if hasattr(model, - 'module') else model # Take care of distributed/parallel training + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) From 0517e7a1cb4a70bdf32f8d11b56df8d3911d1792 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 11:23:49 +0200 Subject: [PATCH 021/119] Fix GPT2 and RoBERTa tokenizer to beging with a space - update Roberta tokenizer --- pytorch_transformers/modeling_gpt2.py | 2 +- pytorch_transformers/tokenization_gpt2.py | 11 +- pytorch_transformers/tokenization_roberta.py | 112 +------------------ 3 files changed, 9 insertions(+), 116 deletions(-) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index af8d3ad10f08d5..b63929db1c49a2 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -682,7 +682,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2DoubleHeadsModel.from_pretrained('gpt2') tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!) - model.resize_token_embeddings(tokenizer.vocab_size + 1) + model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings to the new vocabulary size (add a vector at the end) choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1 diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py index e67f25ff598fb1..eb56e7303e32ba 100644 --- a/pytorch_transformers/tokenization_gpt2.py +++ b/pytorch_transformers/tokenization_gpt2.py @@ -109,11 +109,11 @@ def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endof bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs): super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) - self.encoder = json.load(open(vocab_file)) - self.decoder = {v:k for k,v in self.encoder.items()} - self.errors = errors # how to handle errors in decoding + self.encoder = json.load(open(vocab_file, encoding="utf-8")) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_data] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) @@ -169,6 +169,7 @@ def bpe(self, token): def _tokenize(self, text): """ Tokenize a string. """ + text = ' ' + text # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with. 
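# A minimal sketch of the round-trip behaviour implied by the leading-space change
# above, assuming the standard pretrained 'gpt2' vocabulary files are available; the
# updated docstrings later in this patch series describe the same behaviour.
from pytorch_transformers import GPT2Tokenizer

gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Because _tokenize() now prepends a space, a missing leading space is not preserved:
print(gpt2_tokenizer.decode(gpt2_tokenizer.encode("Hello")))  # -> " Hello"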
bpe_tokens = [] for token in re.findall(self.pat, text): if sys.version_info[0] == 2: @@ -214,4 +215,4 @@ def save_vocabulary(self, save_directory): writer.write(' '.join(bpe_tokens) + u'\n') index += 1 - return vocab_file, merge_file + return vocab_file, merge_file \ No newline at end of file diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py index edf4717c891fc9..c77ad07934d9bc 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/pytorch_transformers/tokenization_roberta.py @@ -23,8 +23,7 @@ import regex as re from io import open -from .tokenization_gpt2 import bytes_to_unicode, get_pairs -from .tokenization_utils import PreTrainedTokenizer +from .tokenization_gpt2 import GPT2Tokenizer try: from functools import lru_cache @@ -63,7 +62,7 @@ def lru_cache(): } -class RobertaTokenizer(PreTrainedTokenizer): +class RobertaTokenizer(GPT2Tokenizer): """ RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE """ @@ -77,89 +76,6 @@ def __init__(self, vocab_file, merges_file, errors='replace', bos_token="", e sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, **kwargs) - self.encoder = json.load(open(vocab_file, encoding="utf-8")) - self.decoder = {v: k for k, v in self.encoder.items()} - self.errors = errors # how to handle errors in decoding - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] - bpe_merges = [tuple(merge.split()) for merge in bpe_data] - self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) - self.cache = {} - - # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions - self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") - - @property - def vocab_size(self): - return len(self.encoder) - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - word = tuple(token) - pairs = get_pairs(word) - - if not pairs: - return token - - while True: - bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: - new_word.extend(word[i:]) - break - - if word[i] == first and i < len(word)-1 and word[i+1] == second: - new_word.append(first+second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = ' '.join(word) - self.cache[token] = word - return word - - def _tokenize(self, text): - """ Tokenize a string. """ - bpe_tokens = [] - for token in re.findall(self.pat, text): - if sys.version_info[0] == 2: - token = ''.join(self.byte_encoder[ord(b)] for b in token) - else: - token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) - bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) - return bpe_tokens - - def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. 
""" - return self.encoder.get(token, self.encoder.get(self.unk_token)) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (string/unicode) using the vocab.""" - return self.decoder.get(index) - - def convert_tokens_to_string(self, tokens): - """ Converts a sequence of tokens (string) in a single string. """ - text = ''.join(tokens) - text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) - return text - def add_special_tokens_single_sentence(self, token_ids): """ Adds special tokens to a sequence for sequence classification tasks. @@ -175,27 +91,3 @@ def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): sep = [self._convert_token_to_id(self.sep_token)] cls = [self._convert_token_to_id(self.cls_token)] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def save_vocabulary(self, save_directory): - """Save the tokenizer vocabulary and merge files to a directory.""" - if not os.path.isdir(save_directory): - logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) - return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) - - with open(vocab_file, 'w', encoding='utf-8') as f: - f.write(json.dumps(self.encoder, ensure_ascii=False)) - - index = 0 - with open(merge_file, "w", encoding="utf-8") as writer: - writer.write(u'#version: 0.2\n') - for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): - if index != token_index: - logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!".format(merge_file)) - index = token_index - writer.write(' '.join(bpe_tokens) + u'\n') - index += 1 - - return vocab_file, merge_file From 50e6daf83abec67964e56ad8fce1477df92a7a3c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 11:27:43 +0200 Subject: [PATCH 022/119] fix Roberta tokenizer __init__ --- pytorch_transformers/tokenization_roberta.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py index c77ad07934d9bc..26805d9f4e7817 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/pytorch_transformers/tokenization_roberta.py @@ -72,7 +72,8 @@ class RobertaTokenizer(GPT2Tokenizer): def __init__(self, vocab_file, merges_file, errors='replace', bos_token="", eos_token="", sep_token="", cls_token="", unk_token="", pad_token='', mask_token='', **kwargs): - super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, + super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors, + bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, **kwargs) From 306af132d7059a8528345a2aab4f64215c87723b Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 11:30:51 +0200 Subject: [PATCH 023/119] update readme to mention add_special_tokens more clearly in example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e57de5842a215..dc51629b4a40ee 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ for model_class, tokenizer_class, pretrained_weights in MODELS: model = model_class.from_pretrained(pretrained_weights) # Encode text - 
input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")]) + input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)]) # Add special tokens takes care of adding [CLS], [SEP], ... tokens in the right way for each model. with torch.no_grad(): last_hidden_states = model(input_ids)[0] # Models outputs are now tuples From d51f72d5decaf89c94720f41b48478b343663fc5 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 11:41:11 +0200 Subject: [PATCH 024/119] adding shortcut to the ids of all the special tokens --- pytorch_transformers/modeling_gpt2.py | 5 +- .../tests/tokenization_tests_commons.py | 4 +- pytorch_transformers/tokenization_utils.py | 56 +++++++++++++++++++ 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index b63929db1c49a2..72b7a7df6fc7f5 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -679,13 +679,16 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): Examples:: + import torch + from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2DoubleHeadsModel.from_pretrained('gpt2') tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!) model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings to the new vocabulary size (add a vector at the end) choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices - mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1 + mc_token_ids = torch.tensor([input_ids.size(-1)]) # Batch size 1 outputs = model(input_ids, mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py index ebcf6f48d87c9a..8d94355beab15e 100644 --- a/pytorch_transformers/tests/tokenization_tests_commons.py +++ b/pytorch_transformers/tests/tokenization_tests_commons.py @@ -128,8 +128,8 @@ def test_add_tokens_tokenizer(self): self.assertGreater(tokens[0], tokens[1]) self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) self.assertGreater(tokens[-2], tokens[-3]) - self.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token)) - self.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token)) + self.assertEqual(tokens[0], tokenizer.eos_token_id) + self.assertEqual(tokens[-2], tokenizer.eos_token_id) def test_required_methods_tokenizer(self): diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index d2855e0922c6e8..1116735d0ec21f 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -155,6 +155,62 @@ def mask_token(self, value): def additional_special_tokens(self, value): self._additional_special_tokens = value + @property + def bos_token_id(self): + """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ + if self._bos_token is None: + logger.error("Using bos_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._bos_token) + + @property + def eos_token_id(self): + """ Id of the end of sentence token in the vocabulary. 
Log an error if used while not having been set. """ + if self._eos_token is None: + logger.error("Using eos_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._eos_token) + + @property + def unk_token_is(self): + """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ + if self._unk_token is None: + logger.error("Using unk_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._unk_token) + + @property + def sep_token_id(self): + """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ + if self._sep_token is None: + logger.error("Using sep_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._sep_token) + + @property + def pad_token_id(self): + """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ + if self._pad_token is None: + logger.error("Using pad_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._pad_token) + + @property + def cls_token_id(self): + """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ + if self._cls_token is None: + logger.error("Using cls_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._cls_token) + + @property + def mask_token_id(self): + """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ + if self._mask_token is None: + logger.error("Using mask_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._mask_token) + + @property + def additional_special_tokens_ids(self): + """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ + if self._additional_special_tokens is None: + logger.error("Using additional_special_tokens, but it is not set yet.") + return self.convert_tokens_to_ids(self._additional_special_tokens) + def __init__(self, max_len=None, **kwargs): self._bos_token = None self._eos_token = None From 0f5a7994568289894e949eafc5831783be87174c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 11:49:23 +0200 Subject: [PATCH 025/119] fix GPT2DoubleHeadModel docstring --- pytorch_transformers/modeling_gpt2.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 72b7a7df6fc7f5..b7872a36d2793d 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -568,8 +568,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): Examples:: + import torch + from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2LMHeadModel.from_pretrained('gpt2') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, logits = outputs[:2] @@ -684,12 +688,20 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2DoubleHeadsModel.from_pretrained('gpt2') - tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!) 
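# A minimal usage sketch of the *_token_id shortcuts introduced in the patch above,
# assuming a standard pretrained BERT tokenizer; each property simply looks up the id
# of the corresponding special token in the vocabulary.
from pytorch_transformers import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
assert bert_tokenizer.cls_token_id == bert_tokenizer.convert_tokens_to_ids('[CLS]')
assert bert_tokenizer.sep_token_id == bert_tokenizer.convert_tokens_to_ids('[SEP]')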
- model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings to the new vocabulary size (add a vector at the end) + + # Add a [CLS] to the vocabulary (we should train it also!) + tokenizer.add_special_tokens({'cls_token': '[CLS]'}) + model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size + print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary + choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] - input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices - mc_token_ids = torch.tensor([input_ids.size(-1)]) # Batch size 1 - outputs = model(input_ids, mc_token_ids) + encoded_choices = [tokenizer.encode(s) for s in choices] + cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] + + input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2 + mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 + + outputs = model(input_ids, mc_token_ids=mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] """ From abe734ca1fe8a618ffe5af61e029c18d4c1f0d8b Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 12:20:18 +0200 Subject: [PATCH 026/119] fix GPT-2 and RoBERTa tests to be clean now --- .../tests/tokenization_gpt2_test.py | 17 +++++++++-------- .../tests/tokenization_roberta_test.py | 17 +++++++++-------- .../tests/tokenization_tests_commons.py | 4 ++-- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py index da7028c27d70ff..8ba3be7e5dbbd1 100644 --- a/pytorch_transformers/tests/tokenization_gpt2_test.py +++ b/pytorch_transformers/tests/tokenization_gpt2_test.py @@ -31,17 +31,18 @@ def setUp(self): # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", - "lo", "low", "er", - "low", "lowest", "newer", "wider", ""] + "\u0120", "\u0120l", "\u0120n", + "\u0120lo", "\u0120low", "er", + "\u0120lowest", "\u0120newer", "\u0120wider", ""] vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "l o", "lo w", "e r", ""] + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] self.special_tokens_map = {"unk_token": ""} self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) - with open(self.vocab_file, "w") as fp: + with open(self.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens)) - with open(self.merges_file, "w") as fp: + with open(self.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) def get_tokenizer(self): @@ -49,18 +50,18 @@ def get_tokenizer(self): def get_input_output_texts(self): input_text = u"lower newer" - output_text = u"lowernewer" + output_text = u" lower newer" return input_text, output_text def test_full_tokenizer(self): tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "lower" - bpe_tokens = ["low", "er"] + bpe_tokens = ["\u0120low", "er"] tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [13, 12, 17] + input_bpe_tokens = [14, 15, 19] self.assertListEqual( tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py index a8f940ae432c00..960a91a5e110d0 100644 --- a/pytorch_transformers/tests/tokenization_roberta_test.py +++ b/pytorch_transformers/tests/tokenization_roberta_test.py @@ -30,17 +30,18 @@ def setUp(self): # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", - "lo", "low", "er", - "low", "lowest", "newer", "wider", ""] + "\u0120", "\u0120l", "\u0120n", + "\u0120lo", "\u0120low", "er", + "\u0120lowest", "\u0120newer", "\u0120wider", ""] vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "l o", "lo w", "e r", ""] + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] self.special_tokens_map = {"unk_token": ""} self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) - with open(self.vocab_file, "w") as fp: + with open(self.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens)) - with open(self.merges_file, "w") as fp: + with open(self.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) def get_tokenizer(self): @@ -48,18 +49,18 @@ def get_tokenizer(self): def get_input_output_texts(self): input_text = u"lower newer" - output_text = u"lowernewer" + output_text = u" lower newer" return input_text, output_text def test_full_tokenizer(self): tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "lower" - bpe_tokens = ["low", "er"] + bpe_tokens = ["\u0120low", "er"] tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [13, 12, 17] + input_bpe_tokens = [14, 15, 19] self.assertListEqual( tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py index 8d94355beab15e..11db2923138c3b 100644 --- a/pytorch_transformers/tests/tokenization_tests_commons.py +++ b/pytorch_transformers/tests/tokenization_tests_commons.py @@ -111,7 +111,7 @@ def test_add_tokens_tokenizer(self): self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) new_toks_2 = {'eos_token': ">>>>|||<||<<|<<", - 'pad_token': "<<<<<|||>|>>>>|>"} + 'pad_token': "<<<<<|||>|>>>>|>"} added_toks_2 = tokenizer.add_special_tokens(new_toks_2) vocab_size_3 = tokenizer.vocab_size all_size_3 = len(tokenizer) @@ -129,7 +129,7 @@ def test_add_tokens_tokenizer(self): self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) self.assertGreater(tokens[-2], tokens[-3]) self.assertEqual(tokens[0], tokenizer.eos_token_id) - self.assertEqual(tokens[-2], tokenizer.eos_token_id) + self.assertEqual(tokens[-2], tokenizer.pad_token_id) def test_required_methods_tokenizer(self): From fd10d79b55d159d845a30adb238cd7019965aa23 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 12:23:12 +0200 Subject: [PATCH 027/119] update GPT2 docstring --- pytorch_transformers/tokenization_gpt2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py index eb56e7303e32ba..1fa7cbd06bc6ed 100644 --- a/pytorch_transformers/tokenization_gpt2.py +++ b/pytorch_transformers/tokenization_gpt2.py @@ -99,7 +99,10 @@ def get_pairs(word): class GPT2Tokenizer(PreTrainedTokenizer): """ GPT-2 BPE tokenizer. Peculiarities: - - Byte-level BPE + - Byte-level Byte-Pair-Encoding + - Requires a space to start the input string => will add a space is there isn't. 
+ As a consequence, this tokenizer `encode` and `decode` method will not conserve + the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello" """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP From 4e6a3172cecef53f790f1c995c7569ca11e04444 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 12:23:37 +0200 Subject: [PATCH 028/119] update roberta docstring as well --- pytorch_transformers/tokenization_roberta.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py index 26805d9f4e7817..f290168c952f80 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/pytorch_transformers/tokenization_roberta.py @@ -64,7 +64,11 @@ def lru_cache(): class RobertaTokenizer(GPT2Tokenizer): """ - RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE + RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: + - Byte-level Byte-Pair-Encoding + - Requires a space to start the input string => will add a space is there isn't. + As a consequence, this tokenizer `encode` and `decode` method will not conserve + the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello" """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP From ca1a00a302c6aff525d949d398ee6bfe42e3e194 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 12:29:31 +0200 Subject: [PATCH 029/119] fix for python2 --- pytorch_transformers/tests/tokenization_gpt2_test.py | 5 +++-- pytorch_transformers/tests/tokenization_roberta_test.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py index 8ba3be7e5dbbd1..3e4fb5bc1d0fe7 100644 --- a/pytorch_transformers/tests/tokenization_gpt2_test.py +++ b/pytorch_transformers/tests/tokenization_gpt2_test.py @@ -17,6 +17,7 @@ import os import unittest import json +from io import open from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES @@ -55,8 +56,8 @@ def get_input_output_texts(self): def test_full_tokenizer(self): tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) - text = "lower" - bpe_tokens = ["\u0120low", "er"] + text = "lower newer" + bpe_tokens = ["\u0120low", "er", "\u0120newer"] tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py index 960a91a5e110d0..e2082e7613353a 100644 --- a/pytorch_transformers/tests/tokenization_roberta_test.py +++ b/pytorch_transformers/tests/tokenization_roberta_test.py @@ -17,6 +17,7 @@ import os import json import unittest +from io import open from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases From 5dd7b677adbd2a228328e42b79583143c16b8dff Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 12:43:08 +0200 Subject: [PATCH 030/119] clean up all byte-level bpe tests --- pytorch_transformers/tests/tokenization_gpt2_test.py | 4 ++-- pytorch_transformers/tests/tokenization_roberta_test.py | 6 +++--- pytorch_transformers/tokenization_gpt2.py | 9 +++++---- 3 files changed, 10 
insertions(+), 9 deletions(-) diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py index 3e4fb5bc1d0fe7..fbeaa2a6dfc04a 100644 --- a/pytorch_transformers/tests/tokenization_gpt2_test.py +++ b/pytorch_transformers/tests/tokenization_gpt2_test.py @@ -57,12 +57,12 @@ def get_input_output_texts(self): def test_full_tokenizer(self): tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "lower newer" - bpe_tokens = ["\u0120low", "er", "\u0120newer"] + bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [14, 15, 19] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] self.assertListEqual( tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py index e2082e7613353a..e1e7f45efd17ba 100644 --- a/pytorch_transformers/tests/tokenization_roberta_test.py +++ b/pytorch_transformers/tests/tokenization_roberta_test.py @@ -55,13 +55,13 @@ def get_input_output_texts(self): def test_full_tokenizer(self): tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) - text = "lower" - bpe_tokens = ["\u0120low", "er"] + text = "lower newer" + bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [14, 15, 19] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] self.assertListEqual( tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py index 1fa7cbd06bc6ed..8a9ade87e10554 100644 --- a/pytorch_transformers/tokenization_gpt2.py +++ b/pytorch_transformers/tokenization_gpt2.py @@ -64,13 +64,14 @@ def lru_cache(): @lru_cache() def bytes_to_unicode(): """ - Returns list of utf-8 byte and a corresponding list of unicode strings. + Returns list of utf-8 byte and a mapping to unicode strings. + We specifically avoids mapping to whitespace/control characters the bpe code barfs on. + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a signficant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. 
""" _chr = unichr if sys.version_info[0] == 2 else chr bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) @@ -176,9 +177,9 @@ def _tokenize(self, text): bpe_tokens = [] for token in re.findall(self.pat, text): if sys.version_info[0] == 2: - token = ''.join(self.byte_encoder[ord(b)] for b in token) + token = ''.join(self.byte_encoder[ord(b)] for b in token) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) else: - token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) return bpe_tokens From ce5ef4b35d5c81813224424dbb71ec8889155bcd Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 13:22:43 +0200 Subject: [PATCH 031/119] python2 doesn't spark joy --- pytorch_transformers/tests/tokenization_gpt2_test.py | 2 +- pytorch_transformers/tests/tokenization_roberta_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py index fbeaa2a6dfc04a..c7e24e8364b46a 100644 --- a/pytorch_transformers/tests/tokenization_gpt2_test.py +++ b/pytorch_transformers/tests/tokenization_gpt2_test.py @@ -42,7 +42,7 @@ def setUp(self): self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens)) + fp.write(json.dumps(vocab_tokens) + "\n") with open(self.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py index e1e7f45efd17ba..45e45d645cee92 100644 --- a/pytorch_transformers/tests/tokenization_roberta_test.py +++ b/pytorch_transformers/tests/tokenization_roberta_test.py @@ -41,7 +41,7 @@ def setUp(self): self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens)) + fp.write(json.dumps(vocab_tokens) + "\n") with open(self.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) From 8faf2e086b4ebefda03feee98f0d5a65238658af Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 13:36:22 +0200 Subject: [PATCH 032/119] more doc on special tokens --- pytorch_transformers/tokenization_utils.py | 23 ++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 1116735d0ec21f..7c66588cc7742b 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -43,21 +43,21 @@ class PreTrainedTokenizer(object): Parameters: - - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` + - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id`` - - ``eos_token``: (`Optional`) string: an end of sentence token. 
Will be associated to ``self.eos_token`` + - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id`` - - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` + - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id`` - - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` + - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id`` - - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` + - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id`` - - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` + - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id`` - - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` + - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` - - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` + - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` """ vocab_files_names = {} pretrained_vocab_files_map = {} @@ -494,6 +494,13 @@ def add_special_tokens(self, special_tokens_dict): to class attributes. If special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the current vocabulary). + Using `add_special_tokens` will ensure your special tokens can be used in several ways: + + - special tokens are carefully handled by the tokenizer (they are never split) + - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. + + When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '') + Args: special_tokens_dict: dict of string. 
Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, From f8aace6bcd1f72ba962263be3de6876572a366a5 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 13:39:52 +0200 Subject: [PATCH 033/119] update tokenizers to use self.XX_token_id instead of converting self.XX_token --- pytorch_transformers/tokenization_bert.py | 6 +++--- pytorch_transformers/tokenization_roberta.py | 6 +++--- pytorch_transformers/tokenization_xlm.py | 6 +++--- pytorch_transformers/tokenization_xlnet.py | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py index 04f35aa4662113..434898d1aa1669 100644 --- a/pytorch_transformers/tokenization_bert.py +++ b/pytorch_transformers/tokenization_bert.py @@ -171,15 +171,15 @@ def add_special_tokens_single_sentence(self, token_ids): Adds special tokens to the a sequence for sequence classification tasks. A BERT sequence has the following format: [CLS] X [SEP] """ - return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)] + return [self.cls_token_id] + token_ids + [self.sep_token_id] def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep def save_vocabulary(self, vocab_path): diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py index f290168c952f80..7c8b3587a13f0d 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/pytorch_transformers/tokenization_roberta.py @@ -86,13 +86,13 @@ def add_special_tokens_single_sentence(self, token_ids): Adds special tokens to a sequence for sequence classification tasks. A RoBERTa sequence has the following format: [CLS] X [SEP] """ - return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)] + return [self.cls_token_id] + token_ids + [self.sep_token_id] def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. A RoBERTa sequence pair has the following format: [CLS] A [SEP][SEP] B [SEP] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index 2d2f3a8cd4db64..ae9fe6c8288a45 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -220,15 +220,15 @@ def add_special_tokens_single_sentence(self, token_ids): Adds special tokens to a sequence for sequence classification tasks. 
An XLM sequence has the following format: [CLS] X [SEP] """ - return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)] + return [self.cls_token_id] + token_ids + [self.sep_token_id] def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep def save_vocabulary(self, save_directory): diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py index 371b3c9407c407..b479a2832d6b89 100644 --- a/pytorch_transformers/tokenization_xlnet.py +++ b/pytorch_transformers/tokenization_xlnet.py @@ -182,8 +182,8 @@ def add_special_tokens_single_sentence(self, token_ids): Adds special tokens to a sequence pair for sequence classification tasks. An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return token_ids + sep + cls def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): @@ -191,8 +191,8 @@ def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): Adds special tokens to a sequence for sequence classification tasks. An XLNet sequence has the following format: X [SEP][CLS] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return token_ids_0 + sep + token_ids_1 + sep + cls def save_vocabulary(self, save_directory): From 82462c5cba0ec07a3eeb1e9455d229ceaf43b5f2 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 15:30:41 +0200 Subject: [PATCH 034/119] Added option to setup pretrained tokenizer arguments --- pytorch_transformers/tokenization_bert.py | 36 +++--- pytorch_transformers/tokenization_utils.py | 23 ++-- pytorch_transformers/tokenization_xlm.py | 135 +++++++++++++++++++-- 3 files changed, 159 insertions(+), 35 deletions(-) diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py index 04f35aa4662113..d1ace940f060ab 100644 --- a/pytorch_transformers/tokenization_bert.py +++ b/pytorch_transformers/tokenization_bert.py @@ -63,6 +63,23 @@ 'bert-base-cased-finetuned-mrpc': 512, } +PRETRAINED_INIT_CONFIGURATION = { + 'bert-base-uncased': {'do_lower_case': True}, + 'bert-large-uncased': {'do_lower_case': True}, + 'bert-base-cased': {'do_lower_case': False}, + 'bert-large-cased': {'do_lower_case': False}, + 'bert-base-multilingual-uncased': {'do_lower_case': True}, + 'bert-base-multilingual-cased': {'do_lower_case': False}, + 'bert-base-chinese': {'do_lower_case': False}, + 'bert-base-german-cased': {'do_lower_case': False}, + 'bert-large-uncased-whole-word-masking': {'do_lower_case': True}, + 'bert-large-cased-whole-word-masking': {'do_lower_case': False}, + 'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True}, + 'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False}, + 'bert-base-cased-finetuned-mrpc': {'do_lower_case': False}, +} + + def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = 
collections.OrderedDict() @@ -100,6 +117,7 @@ class BertTokenizer(PreTrainedTokenizer): vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None, @@ -199,24 +217,6 @@ def save_vocabulary(self, vocab_path): index += 1 return (vocab_file,) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): - """ Instantiate a BertTokenizer from pre-trained vocabulary files. - """ - if pretrained_model_name_or_path in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES: - if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True): - logger.warning("The pre-trained model you are loading is a cased model but you have not set " - "`do_lower_case` to False. We are setting `do_lower_case=False` for you but " - "you may want to check this behavior.") - kwargs['do_lower_case'] = False - elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True): - logger.warning("The pre-trained model you are loading is an uncased model but you have set " - "`do_lower_case` to False. We are setting `do_lower_case=True` for you " - "but you may want to check this behavior.") - kwargs['do_lower_case'] = True - - return super(BertTokenizer, cls)._from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - class BasicTokenizer(object): """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 1d05441593562b..19b37da8c8b118 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -40,6 +40,7 @@ class PreTrainedTokenizer(object): - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string). - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file. - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size. + - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method. 
Parameters: @@ -61,6 +62,7 @@ class PreTrainedTokenizer(object): """ vocab_files_names = {} pretrained_vocab_files_map = {} + pretrained_init_configuration = {} max_model_input_sizes = {} SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token", @@ -235,10 +237,13 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): s3_models = list(cls.max_model_input_sizes.keys()) vocab_files = {} + init_configuration = {} if pretrained_model_name_or_path in s3_models: # Get the vocabulary from AWS S3 bucket for file_id, map_list in cls.pretrained_vocab_files_map.items(): vocab_files[file_id] = map_list[pretrained_model_name_or_path] + if cls.pretrained_init_configuration and pretrained_model_name_or_path in cls.pretrained_init_configuration: + init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path] else: # Get the vocabulary from local files logger.info( @@ -312,28 +317,32 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): logger.info("loading file {} from cache at {}".format( file_path, resolved_vocab_files[file_id])) + # Prepare initialization kwargs + init_kwargs = init_configuration + init_kwargs.update(kwargs) + # Set max length if needed if pretrained_model_name_or_path in cls.max_model_input_sizes: # if we're using a pretrained model, ensure the tokenizer # wont index sequences longer than the number of positional embeddings max_len = cls.max_model_input_sizes[pretrained_model_name_or_path] if max_len is not None and isinstance(max_len, (int, float)): - kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + init_kwargs['max_len'] = min(init_kwargs.get('max_len', int(1e12)), max_len) - # Merge resolved_vocab_files arguments in kwargs. + # Merge resolved_vocab_files arguments in init_kwargs. added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None) special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None) for args_name, file_path in resolved_vocab_files.items(): - if args_name not in kwargs: - kwargs[args_name] = file_path + if args_name not in init_kwargs: + init_kwargs[args_name] = file_path if special_tokens_map_file is not None: special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8")) for key, value in special_tokens_map.items(): - if key not in kwargs: - kwargs[key] = value + if key not in init_kwargs: + init_kwargs[key] = value # Instantiate tokenizer. - tokenizer = cls(*inputs, **kwargs) + tokenizer = cls(*inputs, **init_kwargs) # Add supplementary tokens. 
if added_tokens_file is not None: diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index 71bf119387328c..c40d4cd16e7697 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -47,7 +47,9 @@ 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json", 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json", 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json", - }, + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", + } 'merges_file': { 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt", @@ -58,6 +60,8 @@ 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt", 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt", }, } @@ -70,6 +74,101 @@ 'xlm-mlm-xnli15-1024': 512, 'xlm-clm-enfr-1024': 512, 'xlm-clm-ende-1024': 512, + 'xlm-mlm-17-1280': 512, + 'xlm-mlm-100-1280': 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + 'xlm-mlm-en-2048': {"do_lowercase_and_remove_accent": True}, + 'xlm-mlm-ende-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "de", + "1": "en"}, + "lang2id": { "de": 0, + "en": 1 }}, + 'xlm-mlm-enfr-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "en", + "1": "fr"}, + "lang2id": { "en": 0, + "fr": 1 }}, + 'xlm-mlm-enro-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "en", + "1": "ro"}, + "lang2id": { "en": 0, + "ro": 1 }}, + 'xlm-mlm-tlm-xnli15-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "ar", + "1": "bg", + "2": "de", + "3": "el", + "4": "en", + "5": "es", + "6": "fr", + "7": "hi", + "8": "ru", + "9": "sw", + "10": "th", + "11": "tr", + "12": "ur", + "13": "vi", + "14": "zh"}, + "lang2id": { "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14 }}, + 'xlm-mlm-xnli15-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "ar", + "1": "bg", + "2": "de", + "3": "el", + "4": "en", + "5": "es", + "6": "fr", + "7": "hi", + "8": "ru", + "9": "sw", + "10": "th", + "11": "tr", + "12": "ur", + "13": "vi", + "14": "zh"}, + "lang2id": { "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14 }}, + 'xlm-clm-enfr-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "en", + "1": "fr"}, + "lang2id": { "en": 0, + "fr": 1 }}, + 'xlm-clm-ende-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "de", + "1": "en"}, + "lang2id": { "de": 0, + "en": 1 }}, + 'xlm-mlm-17-1280': {"do_lowercase_and_remove_accent": False}, + 'xlm-mlm-100-1280': {"do_lowercase_and_remove_accent": False}, } def get_pairs(word): @@ 
-183,17 +282,26 @@ class XLMTokenizer(PreTrainedTokenizer): - (optionally) lower case & normalize all inputs text - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ - (ex: "__classify__") to a vocabulary. + (ex: "__classify__") to a vocabulary + + - `lang2id` attribute maps the languages supported by the model to their ids if provided (automatically set for pretrained vocabularies) + + - `id2lang` attribute does the reverse mapping if provided (automatically set for pretrained vocabularies) + + - `do_lowercase_and_remove_accent` controls lower casing and accent removal (automatically set for pretrained vocabularies) """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>", sep_token="</s>", pad_token="<pad>", cls_token="</s>", mask_token="<special1>", additional_special_tokens=["<special0>", "<special1>", "<special2>", "<special3>", "<special4>", "<special5>", - "<special6>", "<special7>", "<special8>", "<special9>"], **kwargs): + "<special6>", "<special7>", "<special8>", "<special9>"], + lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True, + **kwargs): super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, @@ -206,7 +314,12 @@ def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>", self.cache_moses_tokenizer = dict() self.lang_with_custom_tokenizer = set(['zh', 'th', 'ja']) # True for current supported model (v1.2.0), False for XLM-17 & 100 - self.do_lowercase_and_remove_accent = True + self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent + self.lang2id = lang2id + self.id2lang = id2lang + if lang2id is not None and id2lang is not None: + assert len(lang2id) == len(id2lang) + self.ja_word_tokenizer = None self.zh_word_tokenizer = None @@ -244,14 +357,14 @@ def ja_tokenize(self, text): try: import Mykytea self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~')) - except: + except (AttributeError, ImportError) as e: logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps") logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea") logger.error("2. autoreconf -i") logger.error("3. ./configure --prefix=$HOME/local") logger.error("4. make && make install") logger.error("5. pip install kytea") - import sys; sys.exit() + raise e return list(self.ja_word_tokenizer.getWS(text)) @property @@ -336,6 +449,8 @@ def _tokenize(self, text, lang='en', bypass_tokenizer=False): Returns: List of tokens. """ + if lang and self.lang2id and lang not in self.lang2id: + logger.error("Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model.") if bypass_tokenizer: text = text.split() elif lang not in self.lang_with_custom_tokenizer: @@ -349,19 +464,19 @@ def _tokenize(self, text, lang='en', bypass_tokenizer=False): try: if 'pythainlp' not in sys.modules: from pythainlp.tokenize import word_tokenize as th_word_tokenize - except: + except (AttributeError, ImportError) as e: logger.error("Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps") logger.error("1.
pip install pythainlp") - import sys; sys.exit() + raise e text = th_word_tokenize(text) elif lang == 'zh': try: if 'jieba' not in sys.modules: import jieba - except: + except (AttributeError, ImportError) as e: logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps") logger.error("1. pip install jieba") - import sys; sys.exit() + raise e text = ' '.join(jieba.cut(text)) text = self.moses_pipeline(text, lang=lang) text = text.split() From 350bb6bffab9655c98fa31d5b3d1f778f73d685a Mon Sep 17 00:00:00 2001 From: Rabeeh KARIMI Date: Fri, 30 Aug 2019 15:34:28 +0200 Subject: [PATCH 035/119] updated tokenizer loading for addressing reproducibility issues --- examples/run_glue.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/run_glue.py b/examples/run_glue.py index c221b0eb7a3c55..9c23878884925f 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -448,13 +448,14 @@ def main(): # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) - tokenizer = tokenizer_class.from_pretrained(args.output_dir) + tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: + tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) @@ -463,7 +464,6 @@ def main(): for checkpoint in checkpoints: global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) - tokenizer = tokenizer_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=global_step) result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) From 39eb31e11e55b0e39528f5a888cb6914a1a0ca70 Mon Sep 17 00:00:00 2001 From: Rabeeh KARIMI Date: Fri, 30 Aug 2019 15:44:41 +0200 Subject: [PATCH 036/119] remove reloading tokenizer in the training, adding it to the evaluation part --- examples/run_glue.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/run_glue.py b/examples/run_glue.py index 9c23878884925f..b083bb21d20b5b 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -448,7 +448,6 @@ def main(): # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) - tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) From 20c06fa37d343a9ce38c32a23afdb007e1150238 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 30 Aug 2019 10:06:51 -0400 Subject: [PATCH 037/119] Added DistilBERT to documentation index --- docs/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 1ae722e1e87c77..fd73cbe9efab6b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,6 +11,7 @@ The library currently contains PyTorch implementations, pre-trained model weight 4. `Transformer-XL `_ (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `_ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 5. 
`XLNet `_ (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding `_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 6. `XLM `_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining `_ by Guillaume Lample and Alexis Conneau. +6. `DistilBERT `_ by Victor Sanh, Lysandre Debut and Thomas Wolf. .. toctree:: :maxdepth: 2 From a600b30cc35465326ac11e2b4d26865ea555d08b Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 30 Aug 2019 10:08:14 -0400 Subject: [PATCH 038/119] Fix index number in documentation --- docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index fd73cbe9efab6b..89169b09455e14 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,7 +11,7 @@ The library currently contains PyTorch implementations, pre-trained model weight 4. `Transformer-XL `_ (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `_ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 5. `XLNet `_ (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding `_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 6. `XLM `_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining `_ by Guillaume Lample and Alexis Conneau. -6. `DistilBERT `_ by Victor Sanh, Lysandre Debut and Thomas Wolf. +7. `DistilBERT `_ by Victor Sanh, Lysandre Debut and Thomas Wolf. .. toctree:: :maxdepth: 2 From e0caab0cf052c86e456bc4b4fdac5788433ed935 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 30 Aug 2019 10:09:17 -0400 Subject: [PATCH 039/119] fix link --- docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 89169b09455e14..d349e146c9b27b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,7 +11,7 @@ The library currently contains PyTorch implementations, pre-trained model weight 4. `Transformer-XL `_ (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `_ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 5. `XLNet `_ (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding `_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 6. `XLM `_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining `_ by Guillaume Lample and Alexis Conneau. -7. `DistilBERT `_ by Victor Sanh, Lysandre Debut and Thomas Wolf. +7. `DistilBERT `_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT `_ by Victor Sanh, Lysandre Debut and Thomas Wolf. .. 
toctree:: :maxdepth: 2 From 8678ff8df5cc9997537fb62251ba91e58eefc0ec Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 16:26:04 +0200 Subject: [PATCH 040/119] adding 17 and 100 xlm models --- pytorch_transformers/tokenization_xlm.py | 247 ++++++++++++++++++++++- 1 file changed, 244 insertions(+), 3 deletions(-) diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index c40d4cd16e7697..d14acb39c634c8 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -49,7 +49,7 @@ 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json", 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", - } + }, 'merges_file': { 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt", @@ -167,8 +167,249 @@ "1": "en"}, "lang2id": { "de": 0, "en": 1 }}, - 'xlm-mlm-17-1280': {"do_lowercase_and_remove_accent": False}, - 'xlm-mlm-100-1280': {"do_lowercase_and_remove_accent": False}, + 'xlm-mlm-17-1280': {"do_lowercase_and_remove_accent": False, + "id2lang": { + "0": "ar", + "1": "de", + "2": "en", + "3": "es", + "4": "fr", + "5": "hi", + "6": "it", + "7": "ja", + "8": "ko", + "9": "nl", + "10": "pl", + "11": "pt", + "12": "ru", + "13": "sv", + "14": "tr", + "15": "vi", + "16": "zh" + }, + "lang2id": { + "ar": 0, + "de": 1, + "en": 2, + "es": 3, + "fr": 4, + "hi": 5, + "it": 6, + "ja": 7, + "ko": 8, + "nl": 9, + "pl": 10, + "pt": 11, + "ru": 12, + "sv": 13, + "tr": 14, + "vi": 15, + "zh": 16}}, + 'xlm-mlm-100-1280': {"do_lowercase_and_remove_accent": False, + "id2lang": { + "0": "af", + "1": "als", + "2": "am", + "3": "an", + "4": "ang", + "5": "ar", + "6": "arz", + "7": "ast", + "8": "az", + "9": "bar", + "10": "be", + "11": "bg", + "12": "bn", + "13": "br", + "14": "bs", + "15": "ca", + "16": "ceb", + "17": "ckb", + "18": "cs", + "19": "cy", + "20": "da", + "21": "de", + "22": "el", + "23": "en", + "24": "eo", + "25": "es", + "26": "et", + "27": "eu", + "28": "fa", + "29": "fi", + "30": "fr", + "31": "fy", + "32": "ga", + "33": "gan", + "34": "gl", + "35": "gu", + "36": "he", + "37": "hi", + "38": "hr", + "39": "hu", + "40": "hy", + "41": "ia", + "42": "id", + "43": "is", + "44": "it", + "45": "ja", + "46": "jv", + "47": "ka", + "48": "kk", + "49": "kn", + "50": "ko", + "51": "ku", + "52": "la", + "53": "lb", + "54": "lt", + "55": "lv", + "56": "mk", + "57": "ml", + "58": "mn", + "59": "mr", + "60": "ms", + "61": "my", + "62": "nds", + "63": "ne", + "64": "nl", + "65": "nn", + "66": "no", + "67": "oc", + "68": "pl", + "69": "pt", + "70": "ro", + "71": "ru", + "72": "scn", + "73": "sco", + "74": "sh", + "75": "si", + "76": "simple", + "77": "sk", + "78": "sl", + "79": "sq", + "80": "sr", + "81": "sv", + "82": "sw", + "83": "ta", + "84": "te", + "85": "th", + "86": "tl", + "87": "tr", + "88": "tt", + "89": "uk", + "90": "ur", + "91": "uz", + "92": "vi", + "93": "war", + "94": "wuu", + "95": "yi", + "96": "zh", + "97": "zh_classical", + "98": "zh_min_nan", + "99": "zh_yue" + }, + "lang2id": { + "af": 0, + "als": 1, + "am": 2, + "an": 3, + "ang": 4, + "ar": 5, + "arz": 6, + "ast": 7, + "az": 8, + "bar": 9, + "be": 10, + "bg": 11, + "bn": 12, + "br": 13, + "bs": 14, + "ca": 15, + "ceb": 16, + "ckb": 17, + "cs": 18, + "cy": 19, + "da": 20, + "de": 21, + "el": 22, + "en": 23, + "eo": 
24, + "es": 25, + "et": 26, + "eu": 27, + "fa": 28, + "fi": 29, + "fr": 30, + "fy": 31, + "ga": 32, + "gan": 33, + "gl": 34, + "gu": 35, + "he": 36, + "hi": 37, + "hr": 38, + "hu": 39, + "hy": 40, + "ia": 41, + "id": 42, + "is": 43, + "it": 44, + "ja": 45, + "jv": 46, + "ka": 47, + "kk": 48, + "kn": 49, + "ko": 50, + "ku": 51, + "la": 52, + "lb": 53, + "lt": 54, + "lv": 55, + "mk": 56, + "ml": 57, + "mn": 58, + "mr": 59, + "ms": 60, + "my": 61, + "nds": 62, + "ne": 63, + "nl": 64, + "nn": 65, + "no": 66, + "oc": 67, + "pl": 68, + "pt": 69, + "ro": 70, + "ru": 71, + "scn": 72, + "sco": 73, + "sh": 74, + "si": 75, + "simple": 76, + "sk": 77, + "sl": 78, + "sq": 79, + "sr": 80, + "sv": 81, + "sw": 82, + "ta": 83, + "te": 84, + "th": 85, + "tl": 86, + "tr": 87, + "tt": 88, + "uk": 89, + "ur": 90, + "uz": 91, + "vi": 92, + "war": 93, + "wuu": 94, + "yi": 95, + "zh": 96, + "zh_classical": 97, + "zh_min_nan": 98, + "zh_yue": 99 + }}, } def get_pairs(word): From 3871b8a10757c1b67b29324e872f5b865e49c86c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 16:28:42 +0200 Subject: [PATCH 041/119] adding xlm 17 and 100 models and config on aws --- pytorch_transformers/modeling_xlm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index 10be972ea53a7a..d82d45fc27d97e 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -44,6 +44,8 @@ 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin", 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin", 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.json", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.json", } XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", @@ -54,6 +56,8 @@ 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json", 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", } From 88111de07c40797aaca619be693616c3c4cda4bd Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 16:55:48 +0200 Subject: [PATCH 042/119] saving and reloading tokenizer configurations --- pytorch_transformers/tokenization_utils.py | 54 ++++++++++++++++++---- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 19b37da8c8b118..51e59fe46c3aa0 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -20,6 +20,7 @@ import os import json import six +import copy from io import open from .file_utils import cached_path @@ -28,6 +29,7 @@ SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json' ADDED_TOKENS_FILE = 'added_tokens.json' +TOKENIZER_CONFIG_FILE = 
'tokenizer_config.json' class PreTrainedTokenizer(object): """ Base class for all tokenizers. @@ -168,9 +170,15 @@ def __init__(self, max_len=None, **kwargs): self._additional_special_tokens = [] self.max_len = max_len if max_len is not None else int(1e12) + + # Added tokens self.added_tokens_encoder = {} self.added_tokens_decoder = {} + # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) + self.init_inputs = () + self.init_kwargs = {} + for key, value in kwargs.items(): if key in self.SPECIAL_TOKENS_ATTRIBUTES: if key == 'additional_special_tokens': @@ -230,7 +238,7 @@ def from_pretrained(cls, *inputs, **kwargs): @classmethod - def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): cache_dir = kwargs.pop('cache_dir', None) force_download = kwargs.pop('force_download', False) proxies = kwargs.pop('proxies', None) @@ -266,15 +274,17 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): vocab_files[file_id] = full_file_name # Look for the additional tokens files - all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE, - 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE} + additional_files_names = {'added_tokens_file': ADDED_TOKENS_FILE, + 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE, + 'tokenizer_config_file': TOKENIZER_CONFIG_FILE, + } # If a path to a file was provided, get the parent directory saved_directory = pretrained_model_name_or_path if os.path.exists(saved_directory) and not os.path.isdir(saved_directory): saved_directory = os.path.dirname(saved_directory) - for file_id, file_name in all_vocab_files_names.items(): + for file_id, file_name in additional_files_names.items(): full_file_name = os.path.join(saved_directory, file_name) if not os.path.exists(full_file_name): logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) @@ -317,8 +327,18 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): logger.info("loading file {} from cache at {}".format( file_path, resolved_vocab_files[file_id])) - # Prepare initialization kwargs - init_kwargs = init_configuration + # Prepare tokenizer initialization kwargs + # Did we saved some inputs and kwargs to reload ? + tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None) + if tokenizer_config_file is not None: + init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8")) + saved_init_inputs = init_kwargs.pop('init_inputs', []) + if not init_inputs: + init_inputs = saved_init_inputs + else: + init_kwargs = init_configuration + + # Update with newly provided kwargs init_kwargs.update(kwargs) # Set max length if needed @@ -342,7 +362,11 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): init_kwargs[key] = value # Instantiate tokenizer. - tokenizer = cls(*inputs, **init_kwargs) + tokenizer = cls(*init_inputs, **init_kwargs) + + # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` + tokenizer.init_inputs = init_inputs + tokenizer.init_kwargs = init_kwargs # Add supplementary tokens. if added_tokens_file is not None: @@ -355,8 +379,13 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): def save_pretrained(self, save_directory): - """ Save the tokenizer vocabulary files (with added tokens) and the - special-tokens-to-class-attributes-mapping to a directory. 
+ """ Save the tokenizer vocabulary files together with: + - added tokens, + - special-tokens-to-class-attributes-mapping, + - tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert). + + This won't save modifications other than (added tokens and special token mapping) you may have + applied to the tokenizer after the instantion (e.g. modifying tokenizer.do_lower_case after creation). This method make sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method. """ @@ -366,6 +395,13 @@ def save_pretrained(self, save_directory): special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) + tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) + + tokenizer_config = copy.deepcopy(self.init_kwargs) + tokenizer_config['init_inputs'] = copy.deepcopy(self.init_inputs) + + with open(tokenizer_config_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(tokenizer_config, ensure_ascii=False)) with open(special_tokens_map_file, 'w', encoding='utf-8') as f: f.write(json.dumps(self.special_tokens_map, ensure_ascii=False)) From 69da972ace6fd574a528ef269ebcee32305d18ff Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 17:09:36 +0200 Subject: [PATCH 043/119] added test and debug tokenizer configuration serialization --- .../tests/tokenization_bert_test.py | 4 ++-- .../tests/tokenization_gpt2_test.py | 5 +++-- .../tests/tokenization_openai_test.py | 4 ++-- .../tests/tokenization_roberta_test.py | 5 +++-- .../tests/tokenization_tests_commons.py | 15 ++++++++++++--- .../tests/tokenization_transfo_xl_test.py | 5 +++-- .../tests/tokenization_xlm_test.py | 4 ++-- .../tests/tokenization_xlnet_test.py | 4 ++-- pytorch_transformers/tokenization_utils.py | 4 +++- 9 files changed, 32 insertions(+), 18 deletions(-) diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py index db507317a8e5c5..290b3578209e21 100644 --- a/pytorch_transformers/tests/tokenization_bert_test.py +++ b/pytorch_transformers/tests/tokenization_bert_test.py @@ -41,8 +41,8 @@ def setUp(self): with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - def get_tokenizer(self): - return BertTokenizer.from_pretrained(self.tmpdirname) + def get_tokenizer(self, **kwargs): + return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): input_text = u"UNwant\u00E9d,running" diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py index da7028c27d70ff..252dbfe6f474f7 100644 --- a/pytorch_transformers/tests/tokenization_gpt2_test.py +++ b/pytorch_transformers/tests/tokenization_gpt2_test.py @@ -44,8 +44,9 @@ def setUp(self): with open(self.merges_file, "w") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self): - return GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map) + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): input_text = u"lower newer" diff --git a/pytorch_transformers/tests/tokenization_openai_test.py b/pytorch_transformers/tests/tokenization_openai_test.py index bb354f3fb77204..6b86416d2d6b61 100644 --- 
a/pytorch_transformers/tests/tokenization_openai_test.py +++ b/pytorch_transformers/tests/tokenization_openai_test.py @@ -45,8 +45,8 @@ def setUp(self): with open(self.merges_file, "w") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self): - return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname) + def get_tokenizer(self, **kwargs): + return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): input_text = u"lower newer" diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py index a8f940ae432c00..5f9b65a7a308c2 100644 --- a/pytorch_transformers/tests/tokenization_roberta_test.py +++ b/pytorch_transformers/tests/tokenization_roberta_test.py @@ -43,8 +43,9 @@ def setUp(self): with open(self.merges_file, "w") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self): - return RobertaTokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map) + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): input_text = u"lower newer" diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py index ebcf6f48d87c9a..779a3ba6c3f943 100644 --- a/pytorch_transformers/tests/tokenization_tests_commons.py +++ b/pytorch_transformers/tests/tokenization_tests_commons.py @@ -49,14 +49,19 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.tmpdirname) - def get_tokenizer(self): + def get_tokenizer(self, **kwargs): raise NotImplementedError def get_input_output_texts(self): raise NotImplementedError def test_save_and_load_tokenizer(self): + # safety check on max_len default value so we are sure the test works tokenizer = self.get_tokenizer() + self.assertNotEqual(tokenizer.max_len, 42) + + # Now let's start the test + tokenizer = self.get_tokenizer(max_len=42) before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running") @@ -64,8 +69,12 @@ def test_save_and_load_tokenizer(self): tokenizer.save_pretrained(tmpdirname) tokenizer = tokenizer.from_pretrained(tmpdirname) - after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running") - self.assertListEqual(before_tokens, after_tokens) + after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running") + self.assertListEqual(before_tokens, after_tokens) + + self.assertEqual(tokenizer.max_len, 42) + tokenizer = tokenizer.from_pretrained(tmpdirname, max_len=43) + self.assertEqual(tokenizer.max_len, 43) def test_pickle_tokenizer(self): tokenizer = self.get_tokenizer() diff --git a/pytorch_transformers/tests/tokenization_transfo_xl_test.py b/pytorch_transformers/tests/tokenization_transfo_xl_test.py index fbd06cf47e7ef4..f881cf1d2b4279 100644 --- a/pytorch_transformers/tests/tokenization_transfo_xl_test.py +++ b/pytorch_transformers/tests/tokenization_transfo_xl_test.py @@ -37,8 +37,9 @@ def setUp(self): with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - def get_tokenizer(self): - return TransfoXLTokenizer.from_pretrained(self.tmpdirname, lower_case=True) + def get_tokenizer(self, **kwargs): + kwargs['lower_case'] = True + return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): input_text = u" UNwanted , running" diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py 
b/pytorch_transformers/tests/tokenization_xlm_test.py index ede77a1f9884d1..43f1e0c5dd7396 100644 --- a/pytorch_transformers/tests/tokenization_xlm_test.py +++ b/pytorch_transformers/tests/tokenization_xlm_test.py @@ -44,8 +44,8 @@ def setUp(self): with open(self.merges_file, "w") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self): - return XLMTokenizer.from_pretrained(self.tmpdirname) + def get_tokenizer(self, **kwargs): + return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): input_text = u"lower newer" diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py index 9feab7c0bdf811..c603ce55f9d7f9 100644 --- a/pytorch_transformers/tests/tokenization_xlnet_test.py +++ b/pytorch_transformers/tests/tokenization_xlnet_test.py @@ -35,8 +35,8 @@ def setUp(self): tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer.save_pretrained(self.tmpdirname) - def get_tokenizer(self): - return XLNetTokenizer.from_pretrained(self.tmpdirname) + def get_tokenizer(self, **kwargs): + return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): input_text = u"This is a test" diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 51e59fe46c3aa0..8d7c29b16c0fd8 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -332,7 +332,7 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs) tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None) if tokenizer_config_file is not None: init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8")) - saved_init_inputs = init_kwargs.pop('init_inputs', []) + saved_init_inputs = init_kwargs.pop('init_inputs', ()) if not init_inputs: init_inputs = saved_init_inputs else: @@ -399,6 +399,8 @@ def save_pretrained(self, save_directory): tokenizer_config = copy.deepcopy(self.init_kwargs) tokenizer_config['init_inputs'] = copy.deepcopy(self.init_inputs) + for file_id in self.vocab_files_names.keys(): + tokenizer_config.pop(file_id, None) with open(tokenizer_config_file, 'w', encoding='utf-8') as f: f.write(json.dumps(tokenizer_config, ensure_ascii=False)) From 7044ed6b059c7305b0a1ab8576c775829afd9226 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 17:36:11 +0200 Subject: [PATCH 044/119] fix tokenizers serialization --- pytorch_transformers/tests/tokenization_dilbert_test.py | 4 ++-- pytorch_transformers/tests/tokenization_tests_commons.py | 4 ++-- pytorch_transformers/tokenization_transfo_xl.py | 3 ++- pytorch_transformers/tokenization_xlnet.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pytorch_transformers/tests/tokenization_dilbert_test.py b/pytorch_transformers/tests/tokenization_dilbert_test.py index 30268db2166973..42f80609981406 100644 --- a/pytorch_transformers/tests/tokenization_dilbert_test.py +++ b/pytorch_transformers/tests/tokenization_dilbert_test.py @@ -27,8 +27,8 @@ class DistilBertTokenizationTest(BertTokenizationTest): tokenizer_class = DistilBertTokenizer - def get_tokenizer(self): - return DistilBertTokenizer.from_pretrained(self.tmpdirname) + def get_tokenizer(self, **kwargs): + return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs) def test_sequence_builders(self): tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") diff --git 
a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py index 779a3ba6c3f943..6578c5c3a56e22 100644 --- a/pytorch_transformers/tests/tokenization_tests_commons.py +++ b/pytorch_transformers/tests/tokenization_tests_commons.py @@ -67,13 +67,13 @@ def test_save_and_load_tokenizer(self): with TemporaryDirectory() as tmpdirname: tokenizer.save_pretrained(tmpdirname) - tokenizer = tokenizer.from_pretrained(tmpdirname) + tokenizer = self.tokenizer_class.from_pretrained(tmpdirname) after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running") self.assertListEqual(before_tokens, after_tokens) self.assertEqual(tokenizer.max_len, 42) - tokenizer = tokenizer.from_pretrained(tmpdirname, max_len=43) + tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43) self.assertEqual(tokenizer.max_len, 43) def test_pickle_tokenizer(self): diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py index c603ba695c1921..66bc01c1bb0196 100644 --- a/pytorch_transformers/tokenization_transfo_xl.py +++ b/pytorch_transformers/tokenization_transfo_xl.py @@ -95,7 +95,8 @@ def __init__(self, special=None, min_freq=0, max_size=None, lower_case=False, # in a library like ours, at all. vocab_dict = torch.load(pretrained_vocab_file) for key, value in vocab_dict.items(): - self.__dict__[key] = value + if key not in self.__dict__: + self.__dict__[key] = value if vocab_file is not None: self.build_vocab() diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py index ac7231bb680ba9..bf9b9dc782f21a 100644 --- a/pytorch_transformers/tokenization_xlnet.py +++ b/pytorch_transformers/tokenization_xlnet.py @@ -61,7 +61,7 @@ class XLNetTokenizer(PreTrainedTokenizer): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, max_len=None, + def __init__(self, vocab_file, do_lower_case=False, remove_space=True, keep_accents=False, bos_token="", eos_token="", unk_token="", sep_token="", pad_token="", cls_token="", mask_token="", From 803c1cc4eacd38f1b854578d7d717b5e4a1ada47 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Fri, 30 Aug 2019 12:01:27 -0400 Subject: [PATCH 045/119] fix relative import bug cf Issue #1140 --- examples/distillation/scripts/binarized_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py index d1c97bd296b635..792a5692e442ae 100644 --- a/examples/distillation/scripts/binarized_data.py +++ b/examples/distillation/scripts/binarized_data.py @@ -22,7 +22,7 @@ import numpy as np from pytorch_transformers import BertTokenizer -from ..utils import logger +from examples.distillation.utils import logger def main(): parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).") From 282c276e0920b85f0a2b4d348832f4dc83c1d69f Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Fri, 30 Aug 2019 12:02:29 -0400 Subject: [PATCH 046/119] typos + file name coherence in distillation README --- examples/distillation/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/distillation/README.md b/examples/distillation/README.md index 1b8a4f71780d77..bb919385f171d9 100644 --- a/examples/distillation/README.md +++ 
b/examples/distillation/README.md @@ -68,7 +68,7 @@ python train.py \ By default, this will launch a training on a single GPU (even if more are available on the cluster). Other parameters are available in the command line, please look in `train.py` or run `python train.py --help` to list them. -We highly encourage you to distributed training for training DistilBert as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs: +We highly encourage you to use distributed training for training DistilBert as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs: ```bash export NODE_RANK=0 @@ -90,11 +90,11 @@ python -m torch.distributed.launch \ train.py \ --force \ --n_gpu $WORLD_SIZE \ - --data_file data/dump_concat_wiki_toronto_bk.bert-base-uncased.pickle \ - --token_counts data/token_counts_concat_wiki_toronto_bk.bert-base-uncased.pickle \ - --dump_path serialization_dir/with_transform/last_word + --data_file data/binarized_text.bert-base-uncased.pickle \ + --token_counts data/token_counts.bert-base-uncased.pickle \ + --dump_path serialization_dir/my_first_distillation ``` -**Tips** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract_for_distil.py` to create a valid initialization checkpoint and use `--from_pretrained_weights` and `--from_pretrained_config` arguments to use this initialization for the distilled training! +**Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract_for_distil.py` to create a valid initialization checkpoint and use `--from_pretrained_weights` and `--from_pretrained_config` arguments to use this initialization for the distilled training! Happy distillation! 
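Stepping back from the distillation example, the tokenizer changes in patches 034 and 042-044 above (the per-model `pretrained_init_configuration` maps and the new `tokenizer_config.json` serialization) can be exercised end to end. The snippet below is a minimal illustrative sketch, not part of any patch; the local directory name is arbitrary, and the behaviour shown mirrors the updated `tokenization_tests_commons.py` test: initialization keyword arguments survive a `save_pretrained()` / `from_pretrained()` round trip.

```python
from pytorch_transformers import BertTokenizer

# 'bert-base-cased' picks up {'do_lower_case': False} from the new
# PRETRAINED_INIT_CONFIGURATION map; max_len is an extra init kwarg.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', max_len=128)

# Writes vocab.txt, special_tokens_map.json, added_tokens.json and the new
# tokenizer_config.json (the init kwargs minus the vocabulary file paths).
tokenizer.save_pretrained('./my_bert_tokenizer')  # arbitrary local directory

reloaded = BertTokenizer.from_pretrained('./my_bert_tokenizer')
assert reloaded.max_len == 128                          # kwarg restored from tokenizer_config.json
assert reloaded.basic_tokenizer.do_lower_case is False  # pretrained init configuration preserved
```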
From dc43215c0176247b9629494e5173bc06d3f68abb Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 26 Aug 2019 15:44:30 -0400 Subject: [PATCH 047/119] Added multiple AutoModel classes: AutoModelWithLMHead, AutoModelForQuestionAnswering and AutoModelForSequenceClassification --- pytorch_transformers/__init__.py | 3 +- pytorch_transformers/modeling_auto.py | 359 ++++++++++++++++++++++++-- 2 files changed, 344 insertions(+), 18 deletions(-) diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index 47783057d1a9fb..eb356f2165f417 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -11,7 +11,8 @@ from .tokenization_utils import (PreTrainedTokenizer) -from .modeling_auto import (AutoConfig, AutoModel) +from .modeling_auto import (AutoConfig, AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, + AutoModelWithLMHead) from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction, diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py index cdacb7b552c5bc..9dd8a6666aa5fd 100644 --- a/pytorch_transformers/modeling_auto.py +++ b/pytorch_transformers/modeling_auto.py @@ -23,13 +23,13 @@ from torch.nn import CrossEntropyLoss, MSELoss from torch.nn.parameter import Parameter -from .modeling_bert import BertConfig, BertModel -from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel -from .modeling_gpt2 import GPT2Config, GPT2Model -from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel -from .modeling_xlnet import XLNetConfig, XLNetModel -from .modeling_xlm import XLMConfig, XLMModel -from .modeling_roberta import RobertaConfig, RobertaModel +from .modeling_bert import BertConfig, BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering +from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTLMHeadModel +from .modeling_gpt2 import GPT2Config, GPT2Model, GPT2LMHeadModel +from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel +from .modeling_xlnet import XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering +from .modeling_xlm import XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering +from .modeling_roberta import RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification from .modeling_distilbert import DistilBertConfig, DistilBertModel from .modeling_utils import PreTrainedModel, SequenceSummary @@ -140,20 +140,20 @@ class AutoModel(object): when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class method. - The `from_pretrained()` method take care of returning the correct model class instance + The `from_pretrained()` method takes care of returning the correct model class instance using pattern matching on the `pretrained_model_name_or_path` string. 
The base model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaModel (RoBERTa model) - contains `bert`: BertModel (Bert model) - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) - contains `gpt2`: GPT2Model (OpenAI GPT-2 model) - contains `transfo-xl`: TransfoXLModel (Transformer-XL model) - contains `xlnet`: XLNetModel (XLNet model) - contains `xlm`: XLMModel (XLM model) - - contains `roberta`: RobertaModel (RoBERTa model) - This class cannot be instantiated using `__init__()` (throw an error). + This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): raise EnvironmentError("AutoModel is designed to be instantiated " @@ -161,18 +161,18 @@ def __init__(self): @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - r""" Instantiate a one of the base model classes of the library + r""" Instantiates one of the base model classes of the library from a pre-trained model configuration. The base model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaModel (RoBERTa model) - contains `bert`: BertModel (Bert model) - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) - contains `gpt2`: GPT2Model (OpenAI GPT-2 model) - contains `transfo-xl`: TransfoXLModel (Transformer-XL model) - contains `xlnet`: XLNetModel (XLNet model) - contains `xlm`: XLMModel (XLM model) - - contains `roberta`: RobertaModel (RoBERTa model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` @@ -189,12 +189,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. **model_args**: (`optional`) Sequence: - All remaning positional arguments will be passed to the underlying model's __init__ function - **config**: an optional configuration for the model to use instead of an automatically loaded configuation. + All remaining positional arguments will be passed to the underlying model's __init__ function + **config**: an optional configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or - - the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory). - **state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded + - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). + **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. 
In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not @@ -203,7 +203,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. **output_loading_info**: (`optional`) boolean: - Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. **kwargs**: (`optional`) dict: Dictionary of key, values to update the configuration object after loading. Can be used to override selected configuration parameters. E.g. ``output_attention=True``. @@ -248,3 +248,328 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): raise ValueError("Unrecognized model identifier in {}. Should contains one of " "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " "'xlm', 'roberta'".format(pretrained_model_name_or_path)) + + + +class AutoModelWithLMHead(object): + r""" + :class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class + that will be instantiated as one of the language modeling model classes of the library + when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaForMaskedLM (RoBERTa model) + - contains `bert`: BertForMaskedLM (Bert model) + - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model) + - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model) + - contains `xlnet`: XLNetLMHeadModel (XLNet model) + - contains `xlm`: XLMWithLMHeadModel (XLM model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + def __init__(self): + raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated " + "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the language modeling model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. 
+ + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaForMaskedLM (RoBERTa model) + - contains `bert`: BertForMaskedLM (Bert model) + - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model) + - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model) + - contains `xlnet`: XLNetLMHeadModel (XLNet model) + - contains `xlm`: XLMWithLMHeadModel (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + **pretrained_model_name_or_path**: either: + - a string with the `shortcut name` of a pre-trained model to load from cache + or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). + - a path to a `directory` containing a configuration file saved + using the `save_pretrained(save_directory)` method. + - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). + In this case, ``from_tf`` should be set to True and a configuration object should be + provided as `config` argument. This loading option is slower than converting the TensorFlow + checkpoint in a PyTorch model using the provided conversion scripts and loading + the PyTorch model afterwards. + **model_args**: (`optional`) Sequence: + All remaining positional arguments will be passed to the underlying model's __init__ function + **config**: an optional configuration for the model to use instead of an automatically loaded configuration. + Configuration can be automatically loaded when: + - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or + - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). + **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded + from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not + a simpler option. + **cache_dir**: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + **output_loading_info**: (`optional`) boolean: + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. + **kwargs**: (`optional`) dict: + Dictionary of key, values to update the configuration object after loading. + Can be used to override selected configuration parameters. E.g. ``output_attention=True``. + + - If a configuration is provided with `config`, **kwargs will be directly passed + to the underlying model's __init__ method. + - If a configuration is not provided, **kwargs will be first passed to the pretrained + model configuration class loading function (`PretrainedConfig.from_pretrained`). + Each key of **kwargs that corresponds to a configuration attribute + will be used to override said attribute with the supplied **kwargs value. + Remaining keys that do not correspond to any configuration attribute will + be passed to the underlying model's __init__ function. 
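As a complement to the loading examples that follow, here is a minimal masked-LM inference sketch, assuming the BERT tokenizer keeps `[MASK]` as a single token and that `BertForMaskedLM` returns `(prediction_scores, ...)` when called without labels:

    import torch
    from pytorch_transformers import AutoTokenizer, AutoModelWithLMHead

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    model = AutoModelWithLMHead.from_pretrained('bert-base-uncased')   # resolves to BertForMaskedLM

    input_ids = torch.tensor([tokenizer.encode("The capital of France is [MASK] .")])
    mask_position = input_ids[0].tolist().index(tokenizer.convert_tokens_to_ids('[MASK]'))

    with torch.no_grad():
        prediction_scores = model(input_ids)[0]            # (batch, sequence_length, vocab_size)
    predicted_id = prediction_scores[0, mask_position].argmax().item()
    print(tokenizer.convert_ids_to_tokens([predicted_id]))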
+ + Examples:: + + model = AutoModelWithLMHead.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModelWithLMHead.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + if 'roberta' in pretrained_model_name_or_path: + return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'openai-gpt' in pretrained_model_name_or_path: + return OpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'gpt2' in pretrained_model_name_or_path: + return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'transfo-xl' in pretrained_model_name_or_path: + return TransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta'".format(pretrained_model_name_or_path)) + + +class AutoModelForSequenceClassification(object): + r""" + :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class + that will be instantiated as one of the sequence classification model classes of the library + when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) + - contains `bert`: BertForSequenceClassification (Bert model) + - contains `xlnet`: XLNetForSequenceClassification (XLNet model) + - contains `xlm`: XLMForSequenceClassification (XLM model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + def __init__(self): + raise EnvironmentError("AutoModelForSequenceClassification is designed to be instantiated " + "using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the sequence classification model classes of the library + from a pre-trained model configuration.
+ + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) + - contains `bert`: BertForSequenceClassification (Bert model) + - contains `xlnet`: XLNetForSequenceClassification (XLNet model) + - contains `xlm`: XLMForSequenceClassification (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + **pretrained_model_name_or_path**: either: + - a string with the `shortcut name` of a pre-trained model to load from cache + or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). + - a path to a `directory` containing a configuration file saved + using the `save_pretrained(save_directory)` method. + - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). + In this case, ``from_tf`` should be set to True and a configuration object should be + provided as `config` argument. This loading option is slower than converting the TensorFlow + checkpoint in a PyTorch model using the provided conversion scripts and loading + the PyTorch model afterwards. + **model_args**: (`optional`) Sequence: + All remaining positional arguments will be passed to the underlying model's __init__ function + **config**: an optional configuration for the model to use instead of an automatically loaded configuration. + Configuration can be automatically loaded when: + - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or + - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). + **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded + from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not + a simpler option. + **cache_dir**: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + **output_loading_info**: (`optional`) boolean: + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. + **kwargs**: (`optional`) dict: + Dictionary of key, values to update the configuration object after loading. + Can be used to override selected configuration parameters. E.g. ``output_attention=True``. + + - If a configuration is provided with `config`, **kwargs will be directly passed + to the underlying model's __init__ method. + - If a configuration is not provided, **kwargs will be first passed to the pretrained + model configuration class loading function (`PretrainedConfig.from_pretrained`). + Each key of **kwargs that corresponds to a configuration attribute + will be used to override said attribute with the supplied **kwargs value. + Remaining keys that do not correspond to any configuration attribute will + be passed to the underlying model's __init__ function. 
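Since the model comes back in evaluation mode, and since configuration attributes such as `num_labels` can be overridden through the extra keyword arguments, a short fine-tuning-preparation sketch (the 3-label setup is only an example):

    from pytorch_transformers import AutoModelForSequenceClassification

    # num_labels is routed to the configuration, so the classification head is
    # built with 3 output labels instead of the default.
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

    assert not model.training    # from_pretrained() leaves the model in eval mode
    model.train()                # re-enable dropout before fine-tuning
    assert model.training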
+ + Examples:: + + model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + if 'roberta' in pretrained_model_name_or_path: + return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path)) + + +class AutoModelForQuestionAnswering(object): + r""" + :class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class + that will be instantiated as one of the question answering model classes of the library + when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `bert`: BertForQuestionAnswering (Bert model) + - contains `xlnet`: XLNetForQuestionAnswering (XLNet model) + - contains `xlm`: XLMForQuestionAnswering (XLM model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + def __init__(self): + raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated " + "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the question answering model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string.
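For completeness, a bare-bones extractive-QA sketch, assuming the dispatched `BertForQuestionAnswering` returns `(start_logits, end_logits)` when called without span labels; with the plain `bert-base-uncased` checkpoint the QA head is untrained, so the predicted span is only meaningful after fine-tuning:

    import torch
    from pytorch_transformers import AutoTokenizer, AutoModelForQuestionAnswering

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')

    question, context = "Who maintains the library?", "The library is maintained by Hugging Face."
    input_ids = torch.tensor([tokenizer.encode(question + " [SEP] " + context)])

    with torch.no_grad():
        start_logits, end_logits = model(input_ids)[:2]
    start, end = start_logits.argmax().item(), end_logits.argmax().item()
    print(tokenizer.decode(input_ids[0, start:end + 1].tolist()))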
+ + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `bert`: BertForQuestionAnswering (Bert model) + - contains `xlnet`: XLNetForQuestionAnswering (XLNet model) + - contains `xlm`: XLMForQuestionAnswering (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + **pretrained_model_name_or_path**: either: + - a string with the `shortcut name` of a pre-trained model to load from cache + or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). + - a path to a `directory` containing a configuration file saved + using the `save_pretrained(save_directory)` method. + - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). + In this case, ``from_tf`` should be set to True and a configuration object should be + provided as `config` argument. This loading option is slower than converting the TensorFlow + checkpoint in a PyTorch model using the provided conversion scripts and loading + the PyTorch model afterwards. + **model_args**: (`optional`) Sequence: + All remaining positional arguments will be passed to the underlying model's __init__ function + **config**: an optional configuration for the model to use instead of an automatically loaded configuration. + Configuration can be automatically loaded when: + - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or + - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). + **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded + from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not + a simpler option. + **cache_dir**: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + **output_loading_info**: (`optional`) boolean: + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. + **kwargs**: (`optional`) dict: + Dictionary of key, values to update the configuration object after loading. + Can be used to override selected configuration parameters. E.g. ``output_attention=True``. + + - If a configuration is provided with `config`, **kwargs will be directly passed + to the underlying model's __init__ method. + - If a configuration is not provided, **kwargs will be first passed to the pretrained + model configuration class loading function (`PretrainedConfig.from_pretrained`). + Each key of **kwargs that corresponds to a configuration attribute + will be used to override said attribute with the supplied **kwargs value. + Remaining keys that do not correspond to any configuration attribute will + be passed to the underlying model's __init__ function. + + Examples:: + + model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModelForQuestionAnswering.from_pretrained('./test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + if 'bert' in pretrained_model_name_or_path: + return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path)) From 25e83894393bc0e2d42913465fef024cd1e9a9e4 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 26 Aug 2019 16:02:23 -0400 Subject: [PATCH 048/119] Tests for added AutoModels --- .../tests/modeling_auto_test.py | 42 ++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/tests/modeling_auto_test.py b/pytorch_transformers/tests/modeling_auto_test.py index d0c830abc7826e..09d09b28fc287e 100644 --- a/pytorch_transformers/tests/modeling_auto_test.py +++ b/pytorch_transformers/tests/modeling_auto_test.py @@ -21,7 +21,11 @@ import pytest import logging -from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel +from pytorch_transformers import (AutoConfig, BertConfig, + AutoModel, BertModel, + AutoModelWithLMHead, BertForMaskedLM, + AutoModelForSequenceClassification, BertForSequenceClassification, + AutoModelForQuestionAnswering, BertForQuestionAnswering) from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) @@ -42,6 +46,42 @@ def test_model_from_pretrained(self): for value in loading_info.values(): self.assertEqual(len(value), 0) + def test_lmhead_model_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModelWithLMHead.from_pretrained(model_name) + model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForMaskedLM) + + def test_sequence_classification_model_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModelForSequenceClassification.from_pretrained(model_name) + model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForSequenceClassification) + + def test_question_answering_model_from_pretrained(self): + 
logging.basicConfig(level=logging.INFO) + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModelForQuestionAnswering.from_pretrained(model_name) + model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForQuestionAnswering) + if __name__ == "__main__": unittest.main() From 7ca96538529b2814a35f2cf511ea61daf647afee Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 26 Aug 2019 16:08:43 -0400 Subject: [PATCH 049/119] Pytorch Hub & AutoModels --- hubconf.py | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/hubconf.py b/hubconf.py index f8336207802f3b..0326d62b25aac5 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,30 +1,5 @@ dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] -from hubconfs.bert_hubconf import ( - bertTokenizer, - bertModel, - bertForNextSentencePrediction, - bertForPreTraining, - bertForMaskedLM, - bertForSequenceClassification, - bertForMultipleChoice, - bertForQuestionAnswering, - bertForTokenClassification -) -from hubconfs.gpt_hubconf import ( - openAIGPTTokenizer, - openAIGPTModel, - openAIGPTLMHeadModel, - openAIGPTDoubleHeadsModel -) -from hubconfs.gpt2_hubconf import ( - gpt2Tokenizer, - gpt2Model, - gpt2LMHeadModel, - gpt2DoubleHeadsModel -) -from hubconfs.transformer_xl_hubconf import ( - transformerXLTokenizer, - transformerXLModel, - transformerXLLMHeadModel +from pytorch_transformers import ( + AutoTokenizer, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering ) From f35f61228085cb8012ca360047d157bde6f267c2 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 27 Aug 2019 14:42:03 +0200 Subject: [PATCH 050/119] updating docstring for AutoModel --- pytorch_transformers/modeling_auto.py | 332 +++++++++++----------- pytorch_transformers/modeling_utils.py | 11 + pytorch_transformers/tokenization_auto.py | 28 +- 3 files changed, 203 insertions(+), 168 deletions(-) diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py index 9dd8a6666aa5fd..b15a21c646f418 100644 --- a/pytorch_transformers/modeling_auto.py +++ b/pytorch_transformers/modeling_auto.py @@ -32,7 +32,7 @@ from .modeling_roberta import RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification from .modeling_distilbert import DistilBertConfig, DistilBertModel -from .modeling_utils import PreTrainedModel, SequenceSummary +from .modeling_utils import PreTrainedModel, SequenceSummary, add_start_docstrings logger = logging.getLogger(__name__) @@ -77,26 +77,32 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - contains `roberta`: RobertaConfig (RoBERTa model) Params: - **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model configuration to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a saved configuration `file`. 
- **cache_dir**: (`optional`) string: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. + + cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. - **return_unused_kwargs**: (`optional`) bool: + + kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. + + - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. + - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + return_unused_kwargs: (`optional`) bool: + - If False, then this function returns just the final configuration object. - - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` - is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: - ie the part of kwargs which has not been used to update `config` and is otherwise ignored. - **kwargs**: (`optional`) dict: - Dictionary of key/value pairs with which to update the configuration object after loading. - - The values in kwargs of any keys which are configuration attributes will be used - to override the loaded values. - - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled - by the `return_unused_kwargs` keyword parameter. + - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. Examples:: @@ -164,7 +170,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the base model classes of the library from a pre-trained model configuration. - The base model class to instantiate is selected as the first pattern matching + The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): - contains `roberta`: RobertaModel (RoBERTa model) - contains `bert`: BertModel (Bert model) @@ -178,44 +184,46 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): To train the model, you should first set it back in training mode with `model.train()` Params: - **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). 
- - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). - In this case, ``from_tf`` should be set to True and a configuration object should be - provided as `config` argument. This loading option is slower than converting the TensorFlow - checkpoint in a PyTorch model using the provided conversion scripts and loading - the PyTorch model afterwards. - **model_args**: (`optional`) Sequence: - All remaining positional arguments will be passed to the underlying model's __init__ function - **config**: an optional configuration for the model to use instead of an automatically loaded configuration. - Configuration can be automatically loaded when: - - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or - - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). - **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded - from saved weights file. + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not - a simpler option. - **cache_dir**: (`optional`) string: + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. 
- **output_loading_info**: (`optional`) boolean: - Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. - **kwargs**: (`optional`) dict: - Dictionary of key, values to update the configuration object after loading. - Can be used to override selected configuration parameters. E.g. ``output_attention=True``. - - - If a configuration is provided with `config`, **kwargs will be directly passed - to the underlying model's __init__ method. - - If a configuration is not provided, **kwargs will be first passed to the pretrained - model configuration class loading function (`PretrainedConfig.from_pretrained`). - Each key of **kwargs that corresponds to a configuration attribute - will be used to override said attribute with the supplied **kwargs value. - Remaining keys that do not correspond to any configuration attribute will - be passed to the underlying model's __init__ function. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -299,44 +307,46 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): To train the model, you should first set it back in training mode with `model.train()` Params: - **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). - In this case, ``from_tf`` should be set to True and a configuration object should be - provided as `config` argument. This loading option is slower than converting the TensorFlow - checkpoint in a PyTorch model using the provided conversion scripts and loading - the PyTorch model afterwards. 
- **model_args**: (`optional`) Sequence: - All remaining positional arguments will be passed to the underlying model's __init__ function - **config**: an optional configuration for the model to use instead of an automatically loaded configuration. - Configuration can be automatically loaded when: - - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or - - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). - **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded - from saved weights file. + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not - a simpler option. - **cache_dir**: (`optional`) string: + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. - **output_loading_info**: (`optional`) boolean: - Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. - **kwargs**: (`optional`) dict: - Dictionary of key, values to update the configuration object after loading. - Can be used to override selected configuration parameters. E.g. ``output_attention=True``. - - - If a configuration is provided with `config`, **kwargs will be directly passed - to the underlying model's __init__ method. 
- - If a configuration is not provided, **kwargs will be first passed to the pretrained - model configuration class loading function (`PretrainedConfig.from_pretrained`). - Each key of **kwargs that corresponds to a configuration attribute - will be used to override said attribute with the supplied **kwargs value. - Remaining keys that do not correspond to any configuration attribute will - be passed to the underlying model's __init__ function. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -411,44 +421,46 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): To train the model, you should first set it back in training mode with `model.train()` Params: - **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). - In this case, ``from_tf`` should be set to True and a configuration object should be - provided as `config` argument. This loading option is slower than converting the TensorFlow - checkpoint in a PyTorch model using the provided conversion scripts and loading - the PyTorch model afterwards. - **model_args**: (`optional`) Sequence: - All remaining positional arguments will be passed to the underlying model's __init__ function - **config**: an optional configuration for the model to use instead of an automatically loaded configuration. - Configuration can be automatically loaded when: - - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or - - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). - **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded - from saved weights file. 
+ pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not - a simpler option. - **cache_dir**: (`optional`) string: + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. - **output_loading_info**: (`optional`) boolean: - Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. - **kwargs**: (`optional`) dict: - Dictionary of key, values to update the configuration object after loading. - Can be used to override selected configuration parameters. E.g. ``output_attention=True``. - - - If a configuration is provided with `config`, **kwargs will be directly passed - to the underlying model's __init__ method. - - If a configuration is not provided, **kwargs will be first passed to the pretrained - model configuration class loading function (`PretrainedConfig.from_pretrained`). - Each key of **kwargs that corresponds to a configuration attribute - will be used to override said attribute with the supplied **kwargs value. - Remaining keys that do not correspond to any configuration attribute will - be passed to the underlying model's __init__ function. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. 
+ + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -514,44 +526,46 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): To train the model, you should first set it back in training mode with `model.train()` Params: - **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`). - In this case, ``from_tf`` should be set to True and a configuration object should be - provided as `config` argument. This loading option is slower than converting the TensorFlow - checkpoint in a PyTorch model using the provided conversion scripts and loading - the PyTorch model afterwards. - **model_args**: (`optional`) Sequence: - All remaining positional arguments will be passed to the underlying model's __init__ function - **config**: an optional configuration for the model to use instead of an automatically loaded configuration. - Configuration can be automatically loaded when: - - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or - - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory). - **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded - from saved weights file. + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. 
This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not - a simpler option. - **cache_dir**: (`optional`) string: + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. - **output_loading_info**: (`optional`) boolean: - Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. - **kwargs**: (`optional`) dict: - Dictionary of key, values to update the configuration object after loading. - Can be used to override selected configuration parameters. E.g. ``output_attention=True``. - - - If a configuration is provided with `config`, **kwargs will be directly passed - to the underlying model's __init__ method. - - If a configuration is not provided, **kwargs will be first passed to the pretrained - model configuration class loading function (`PretrainedConfig.from_pretrained`). - Each key of **kwargs that corresponds to a configuration attribute - will be used to override said attribute with the supplied **kwargs value. - Remaining keys that do not correspond to any configuration attribute will - be passed to the underlying model's __init__ function. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). 
Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 468d240fbc86d4..0d4fce67f0c3c3 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -59,6 +59,12 @@ def docstring_decorator(fn): fn.__doc__ = ''.join(docstr) + fn.__doc__ return fn return docstring_decorator + + def add_end_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + ''.join(docstr) + return fn + return docstring_decorator else: # Not possible to update class docstrings on python2 def add_start_docstrings(*docstr): @@ -66,6 +72,11 @@ def docstring_decorator(fn): return fn return docstring_decorator + def add_end_docstrings(*docstr): + def docstring_decorator(fn): + return fn + return docstring_decorator + class PretrainedConfig(object): r""" Base class for all configuration classes. diff --git a/pytorch_transformers/tokenization_auto.py b/pytorch_transformers/tokenization_auto.py index b4b63369529e78..576dee70ece4dc 100644 --- a/pytorch_transformers/tokenization_auto.py +++ b/pytorch_transformers/tokenization_auto.py @@ -69,15 +69,25 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): - contains `roberta`: RobertaTokenizer (XLM model) Params: - **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model configuration to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a saved configuration `file`. - **cache_dir**: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. + - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the vocabulary files and override the cached versions if they exists. 
+ + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. + + kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details. Examples:: From bc29aa67a95290dd9a1fd0f71755bfe32ab6b176 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 30 Aug 2019 11:13:51 -0400 Subject: [PATCH 051/119] HubConf configuration --- hubconf.py | 11 +- hubconfs/automodels_hubconf.py | 403 +++++++++++++++++++++++++++++++++ 2 files changed, 411 insertions(+), 3 deletions(-) create mode 100644 hubconfs/automodels_hubconf.py diff --git a/hubconf.py b/hubconf.py index 0326d62b25aac5..d8361c08c69c95 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,5 +1,10 @@ dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] -from pytorch_transformers import ( - AutoTokenizer, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering -) +from hubconfs.automodels_hubconf import ( + autoConfig, + autoModel, + autoModelForQuestionAnswering, + autoModelForSequenceClassification, + autoModelWithLMHead, + autoTokenizer, +) \ No newline at end of file diff --git a/hubconfs/automodels_hubconf.py b/hubconfs/automodels_hubconf.py new file mode 100644 index 00000000000000..2d237c6751e99e --- /dev/null +++ b/hubconfs/automodels_hubconf.py @@ -0,0 +1,403 @@ +from pytorch_transformers import ( + AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering +) + + +def autoConfig(*args, **kwargs): + r""" Instantiates one of the configuration classes of the library + from a pre-trained model configuration. + + The configuration class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `bert`: BertConfig (Bert model) + - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) + - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) + - contains `xlnet`: XLNetConfig (XLNet model) + - contains `xlm`: XLMConfig (XLM model) + - contains `roberta`: RobertaConfig (RoBERTa model) + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. + + - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. 
+ - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + return_unused_kwargs: (`optional`) bool: + + - If False, then this function returns just the final configuration object. + - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. + + Examples:: + + config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') + config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) + assert config.output_attention == True + config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, + foo=False, return_unused_kwargs=True) + assert config.output_attention == True + assert unused_kwargs == {'foo': False} + + """ + + return AutoConfig.from_pretrained(*args, **kwargs) + + +def autoTokenizer(*args, **kwargs): + r""" Instantiates one of the tokenizer classes of the library + from a pre-trained model vocabulary. + + The tokenizer class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `bert`: BertTokenizer (Bert model) + - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) + - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) + - contains `xlnet`: XLNetTokenizer (XLNet model) + - contains `xlm`: XLMTokenizer (XLM model) + - contains `roberta`: RobertaTokenizer (XLM model) + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. + - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the vocabulary files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 
+ The proxies are used on each request. + + inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. + + kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details. + + Examples:: + + config = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache. + config = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` + + """ + + return AutoTokenizer.from_pretrained(*args, **kwargs) + + +def autoModel(*args, **kwargs): + r""" Instantiates one of the base model classes of the library + from a pre-trained model configuration. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaModel (RoBERTa model) + - contains `bert`: BertModel (Bert model) + - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) + - contains `gpt2`: GPT2Model (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLModel (Transformer-XL model) + - contains `xlnet`: XLNetModel (XLNet model) + - contains `xlm`: XLMModel (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. 
+ In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModel.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + + return AutoModel.from_pretrained(*args, **kwargs) + + +def autoModelWithLMHead(*args, **kwargs): + r""" Instantiates one of the language modeling model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. 
+ + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaForMaskedLM (RoBERTa model) + - contains `bert`: BertForMaskedLM (Bert model) + - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model) + - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model) + - contains `xlnet`: XLNetLMHeadModel (XLNet model) + - contains `xlm`: XLMWithLMHeadModel (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. 
+ + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = AutoModelWithLMHead.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModelWithLMHead.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + return AutoModelWithLMHead.from_pretrained(*args, **kwargs) + + +def autoModelForSequenceClassification(*args, **kwargs): + r""" Instantiates one of the sequence classification model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) + - contains `bert`: BertForSequenceClassification (Bert model) + - contains `xlnet`: XLNetForSequenceClassification (XLNet model) + - contains `xlm`: XLMForSequenceClassification (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. 
+ + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + + return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs) + + +def autoModelForQuestionAnswering(*args, **kwargs): + r""" Instantiates one of the question answering model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `bert`: BertForQuestionAnswering (Bert model) + - contains `xlnet`: XLNetForQuestionAnswering (XLNet model) + - contains `xlm`: XLMForQuestionAnswering (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. 
+ + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModelForQuestionAnswering.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs) From dec8f4d6fdc106ec63a30ad9ad33526be0675f5a Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 30 Aug 2019 13:52:18 -0400 Subject: [PATCH 052/119] Added DistilBERT models to all other AutoModels. 
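A minimal usage sketch of what this change enables (illustrative only, not part of
the patch; the `distilbert-base-uncased` shortcut name is assumed to be available
at this point in the release history): once the dispatch below is in place, the
Auto classes should resolve DistilBERT checkpoint names to the DistilBERT-specific
classes.

    # Hypothetical example -- 'distilbert-base-uncased' is an assumed shortcut name
    from pytorch_transformers import AutoModel, AutoModelWithLMHead

    model = AutoModel.from_pretrained('distilbert-base-uncased')               # dispatches to DistilBertModel
    lm_model = AutoModelWithLMHead.from_pretrained('distilbert-base-uncased')  # dispatches to DistilBertForMaskedLM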
--- pytorch_transformers/modeling_auto.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py index b15a21c646f418..0c328909c2cd19 100644 --- a/pytorch_transformers/modeling_auto.py +++ b/pytorch_transformers/modeling_auto.py @@ -30,12 +30,13 @@ from .modeling_xlnet import XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering from .modeling_xlm import XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering from .modeling_roberta import RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification -from .modeling_distilbert import DistilBertConfig, DistilBertModel +from .modeling_distilbert import DistilBertConfig, DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification from .modeling_utils import PreTrainedModel, SequenceSummary, add_start_docstrings logger = logging.getLogger(__name__) + class AutoConfig(object): r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class that will be instantiated as one of the configuration classes of the library @@ -47,6 +48,7 @@ class method. The base model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertConfig (DistilBERT model) - contains `bert`: BertConfig (Bert model) - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) @@ -68,6 +70,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): The configuration class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertConfig (DistilBERT model) - contains `bert`: BertConfig (Bert model) - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) @@ -151,6 +154,7 @@ class method. The base model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertModel (DistilBERT model) - contains `roberta`: RobertaModel (RoBERTa model) - contains `bert`: BertModel (Bert model) - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) @@ -172,6 +176,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertModel (DistilBERT model) - contains `roberta`: RobertaModel (RoBERTa model) - contains `bert`: BertModel (Bert model) - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) @@ -258,7 +263,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): "'xlm', 'roberta'".format(pretrained_model_name_or_path)) - class AutoModelWithLMHead(object): r""" :class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class @@ -271,6 +275,7 @@ class method. 
The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model) - contains `roberta`: RobertaForMaskedLM (RoBERTa model) - contains `bert`: BertForMaskedLM (Bert model) - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model) @@ -295,6 +300,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model) - contains `roberta`: RobertaForMaskedLM (RoBERTa model) - contains `bert`: BertForMaskedLM (Bert model) - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model) @@ -359,7 +365,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'roberta' in pretrained_model_name_or_path: + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'roberta' in pretrained_model_name_or_path: return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'bert' in pretrained_model_name_or_path: return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) @@ -391,6 +399,7 @@ class method. The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model) - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) - contains `bert`: BertForSequenceClassification (Bert model) - contains `xlnet`: XLNetForSequenceClassification (XLNet model) @@ -412,6 +421,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model) - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) - contains `bert`: BertForSequenceClassification (Bert model) - contains `xlnet`: XLNetForSequenceClassification (XLNet model) @@ -473,7 +483,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'roberta' in pretrained_model_name_or_path: + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'roberta' in pretrained_model_name_or_path: return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'bert' in pretrained_model_name_or_path: return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) @@ -498,6 +510,7 @@ class method. 
The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model) - contains `bert`: BertForQuestionAnswering (Bert model) - contains `xlnet`: XLNetForQuestionAnswering (XLNet model) - contains `xlm`: XLMForQuestionAnswering (XLM model) @@ -518,6 +531,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model) - contains `bert`: BertForQuestionAnswering (Bert model) - contains `xlnet`: XLNetForQuestionAnswering (XLNet model) - contains `xlm`: XLMForQuestionAnswering (XLM model) @@ -578,7 +592,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'bert' in pretrained_model_name_or_path: + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'bert' in pretrained_model_name_or_path: return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'xlnet' in pretrained_model_name_or_path: return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) From 9b6e3b34d9fff2ca5868371c48e4a2da0dd69c33 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 30 Aug 2019 14:09:02 -0400 Subject: [PATCH 053/119] Docstrings --- hubconfs/automodels_hubconf.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hubconfs/automodels_hubconf.py b/hubconfs/automodels_hubconf.py index 2d237c6751e99e..b35073d77ec076 100644 --- a/hubconfs/automodels_hubconf.py +++ b/hubconfs/automodels_hubconf.py @@ -113,6 +113,7 @@ def autoModel(*args, **kwargs): The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertModel (DistilBERT model) - contains `roberta`: RobertaModel (RoBERTa model) - contains `bert`: BertModel (Bert model) - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) @@ -190,6 +191,7 @@ def autoModelWithLMHead(*args, **kwargs): The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertModelForMaskedLM (DistilBERT model) - contains `roberta`: RobertaForMaskedLM (RoBERTa model) - contains `bert`: BertForMaskedLM (Bert model) - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model) @@ -266,6 +268,7 @@ def autoModelForSequenceClassification(*args, **kwargs): The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertModelForSequenceClassification (DistilBERT model) - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) - contains `bert`: BertForSequenceClassification (Bert model) - contains `xlnet`: XLNetForSequenceClassification (XLNet model) @@ -340,6 +343,7 @@ def autoModelForQuestionAnswering(*args, **kwargs): The model class to instantiate is selected as the first 
pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertModelForQuestionAnswering (DistilBERT model) - contains `bert`: BertForQuestionAnswering (Bert model) - contains `xlnet`: XLNetForQuestionAnswering (XLNet model) - contains `xlm`: XLMForQuestionAnswering (XLM model) From 7a1f174a9d654814019548f348968b2f14a248f4 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 22:20:44 +0200 Subject: [PATCH 054/119] update names of torch.hub to simpler names - update docstring --- hubconf.py | 14 +- hubconfs/automodels_hubconf.py | 409 +++++---------------------------- 2 files changed, 63 insertions(+), 360 deletions(-) diff --git a/hubconf.py b/hubconf.py index 6e7b6b21eb7ddf..05afd63a46f774 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,10 +1,10 @@ -dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] +dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses'] from hubconfs.automodels_hubconf import ( - autoConfig, - autoModel, - autoModelForQuestionAnswering, - autoModelForSequenceClassification, - autoModelWithLMHead, - autoTokenizer, + config, + model, + modelForQuestionAnswering, + modelForSequenceClassification, + modelWithLMHead, + tokenizer, ) diff --git a/hubconfs/automodels_hubconf.py b/hubconfs/automodels_hubconf.py index b35073d77ec076..5c1ab5ebc6e26e 100644 --- a/hubconfs/automodels_hubconf.py +++ b/hubconfs/automodels_hubconf.py @@ -1,59 +1,20 @@ from pytorch_transformers import ( AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering ) - - -def autoConfig(*args, **kwargs): - r""" Instantiates one of the configuration classes of the library - from a pre-trained model configuration. - - The configuration class to instantiate is selected as the first pattern matching - in the `pretrained_model_name_or_path` string (in the following order): - - contains `bert`: BertConfig (Bert model) - - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) - - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) - - contains `xlnet`: XLNetConfig (XLNet model) - - contains `xlm`: XLMConfig (XLM model) - - contains `roberta`: RobertaConfig (RoBERTa model) - - Params: - pretrained_model_name_or_path: either: - - - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. - - cache_dir: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - - kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. - - - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. - - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. - - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. 
- - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - - return_unused_kwargs: (`optional`) bool: - - - If False, then this function returns just the final configuration object. - - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. - - Examples:: - - config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. - config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` - config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') - config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) +from pytorch_transformers.modeling_utils import add_start_docstrings + +@add_start_docstrings(AutoConfig.__doc__) +def config(*args, **kwargs): + r""" + # Using torch.hub ! + import torch + + config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache. + config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json') + config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False) assert config.output_attention == True - config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, - foo=False, return_unused_kwargs=True) + config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True) assert config.output_attention == True assert unused_kwargs == {'foo': False} @@ -62,346 +23,88 @@ def autoConfig(*args, **kwargs): return AutoConfig.from_pretrained(*args, **kwargs) -def autoTokenizer(*args, **kwargs): - r""" Instantiates one of the tokenizer classes of the library - from a pre-trained model vocabulary. - - The tokenizer class to instantiate is selected as the first pattern matching - in the `pretrained_model_name_or_path` string (in the following order): - - contains `bert`: BertTokenizer (Bert model) - - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) - - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) - - contains `xlnet`: XLNetTokenizer (XLNet model) - - contains `xlm`: XLMTokenizer (XLM model) - - contains `roberta`: RobertaTokenizer (XLM model) - - Params: - pretrained_model_name_or_path: either: - - - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. 
- - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. - - cache_dir: (`optional`) string: - Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. - - force_download: (`optional`) boolean, default False: - Force to (re-)download the vocabulary files and override the cached versions if they exists. - - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - - inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. - - kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details. +@add_start_docstrings(AutoTokenizer.__doc__) +def tokenizer(*args, **kwargs): + r""" + # Using torch.hub ! + import torch - Examples:: - - config = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache. - config = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` + tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache. + tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` """ return AutoTokenizer.from_pretrained(*args, **kwargs) -def autoModel(*args, **kwargs): - r""" Instantiates one of the base model classes of the library - from a pre-trained model configuration. - - The model class to instantiate is selected as the first pattern matching - in the `pretrained_model_name_or_path` string (in the following order): - - contains `distilbert`: DistilBertModel (DistilBERT model) - - contains `roberta`: RobertaModel (RoBERTa model) - - contains `bert`: BertModel (Bert model) - - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) - - contains `gpt2`: GPT2Model (OpenAI GPT-2 model) - - contains `transfo-xl`: TransfoXLModel (Transformer-XL model) - - contains `xlnet`: XLNetModel (XLNet model) - - contains `xlm`: XLMModel (XLM model) - - The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) - To train the model, you should first set it back in training mode with `model.train()` - - Params: - pretrained_model_name_or_path: either: - - - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. 
This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. - - model_args: (`optional`) Sequence of positional arguments: - All remaning positional arguments will be passed to the underlying model's ``__init__`` method - - config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: - Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - - - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. - - state_dict: (`optional`) dict: - an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. - This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. - - cache_dir: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. +@add_start_docstrings(AutoModel.__doc__) +def model(*args, **kwargs): + r""" + # Using torch.hub ! + import torch - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - - output_loading_info: (`optional`) boolean: - Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. - - kwargs: (`optional`) Remaining dictionary of keyword arguments: - Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - - - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. - - Examples:: - - model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. - model = AutoModel.from_pretrained('./test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` - model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') - model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ return AutoModel.from_pretrained(*args, **kwargs) +@add_start_docstrings(AutoModelWithLMHead.__doc__) +def modelWithLMHead(*args, **kwargs): + r""" + # Using torch.hub ! + import torch -def autoModelWithLMHead(*args, **kwargs): - r""" Instantiates one of the language modeling model classes of the library - from a pre-trained model configuration. - - The `from_pretrained()` method takes care of returning the correct model class instance - using pattern matching on the `pretrained_model_name_or_path` string. - - The model class to instantiate is selected as the first pattern matching - in the `pretrained_model_name_or_path` string (in the following order): - - contains `distilbert`: DistilBertModelForMaskedLM (DistilBERT model) - - contains `roberta`: RobertaForMaskedLM (RoBERTa model) - - contains `bert`: BertForMaskedLM (Bert model) - - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model) - - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model) - - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model) - - contains `xlnet`: XLNetLMHeadModel (XLNet model) - - contains `xlm`: XLMWithLMHeadModel (XLM model) - - The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) - To train the model, you should first set it back in training mode with `model.train()` - - Params: - pretrained_model_name_or_path: either: - - - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. - - model_args: (`optional`) Sequence of positional arguments: - All remaning positional arguments will be passed to the underlying model's ``__init__`` method - - config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: - Configuration for the model to use instead of an automatically loaded configuation. 
Configuration can be automatically loaded when: - - - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. - - state_dict: (`optional`) dict: - an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. - This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. - - cache_dir: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. - - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - - output_loading_info: (`optional`) boolean: - Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. - - kwargs: (`optional`) Remaining dictionary of keyword arguments: - Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - - - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. - - Examples:: - - model = AutoModelWithLMHead.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. - model = AutoModelWithLMHead.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` - model = AutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') - model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ return AutoModelWithLMHead.from_pretrained(*args, **kwargs) -def autoModelForSequenceClassification(*args, **kwargs): - r""" Instantiates one of the sequence classification model classes of the library - from a pre-trained model configuration. - - The `from_pretrained()` method takes care of returning the correct model class instance - using pattern matching on the `pretrained_model_name_or_path` string. - - The model class to instantiate is selected as the first pattern matching - in the `pretrained_model_name_or_path` string (in the following order): - - contains `distilbert`: DistilBertModelForSequenceClassification (DistilBERT model) - - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) - - contains `bert`: BertForSequenceClassification (Bert model) - - contains `xlnet`: XLNetForSequenceClassification (XLNet model) - - contains `xlm`: XLMForSequenceClassification (XLM model) - - The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) - To train the model, you should first set it back in training mode with `model.train()` - - Params: - pretrained_model_name_or_path: either: - - - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. - - model_args: (`optional`) Sequence of positional arguments: - All remaning positional arguments will be passed to the underlying model's ``__init__`` method - - config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: - Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - - - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. - - state_dict: (`optional`) dict: - an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. 
- This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. - - cache_dir: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. +@add_start_docstrings(AutoModelForSequenceClassification.__doc__) +def modelForSequenceClassification(*args, **kwargs): + r""" + # Using torch.hub ! + import torch - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - - output_loading_info: (`optional`) boolean: - Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. - - kwargs: (`optional`) Remaining dictionary of keyword arguments: - Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - - - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. - - Examples:: - - model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. - model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` - model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') - model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs) -def autoModelForQuestionAnswering(*args, **kwargs): - r""" Instantiates one of the question answering model classes of the library - from a pre-trained model configuration. - - The `from_pretrained()` method takes care of returning the correct model class instance - using pattern matching on the `pretrained_model_name_or_path` string. - - The model class to instantiate is selected as the first pattern matching - in the `pretrained_model_name_or_path` string (in the following order): - - contains `distilbert`: DistilBertModelForQuestionAnswering (DistilBERT model) - - contains `bert`: BertForQuestionAnswering (Bert model) - - contains `xlnet`: XLNetForQuestionAnswering (XLNet model) - - contains `xlm`: XLMForQuestionAnswering (XLM model) - - The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) - To train the model, you should first set it back in training mode with `model.train()` - - Params: - pretrained_model_name_or_path: either: - - - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. - - model_args: (`optional`) Sequence of positional arguments: - All remaning positional arguments will be passed to the underlying model's ``__init__`` method - - config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: - Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - - - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. - - state_dict: (`optional`) dict: - an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. 
- This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. - - cache_dir: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. - - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - - output_loading_info: (`optional`) boolean: - Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. - - kwargs: (`optional`) Remaining dictionary of keyword arguments: - Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - - - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. - - Examples:: +@add_start_docstrings(AutoModelForQuestionAnswering.__doc__) +def modelForQuestionAnswering(*args, **kwargs): + r""" + # Using torch.hub ! + import torch - model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. - model = AutoModelForQuestionAnswering.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` - model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') - model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs) From 455a4c842c9137fdda0548e80b5cbb766643b76c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 22:20:51 +0200 Subject: [PATCH 055/119] add distilbert tokenizer --- pytorch_transformers/tokenization_auto.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/tokenization_auto.py b/pytorch_transformers/tokenization_auto.py index 576dee70ece4dc..447d360ca2edae 100644 --- a/pytorch_transformers/tokenization_auto.py +++ b/pytorch_transformers/tokenization_auto.py @@ -25,6 +25,7 @@ from .tokenization_xlnet import XLNetTokenizer from .tokenization_xlm import XLMTokenizer from .tokenization_roberta import RobertaTokenizer +from.tokenization_distilbert import DistilBertTokenizer logger = logging.getLogger(__name__) @@ -39,13 +40,14 @@ class method. The tokenizer class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertTokenizer (DistilBert model) + - contains `roberta`: RobertaTokenizer (RoBERTa model) - contains `bert`: BertTokenizer (Bert model) - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) - contains `xlnet`: XLNetTokenizer (XLNet model) - contains `xlm`: XLMTokenizer (XLM model) - - contains `roberta`: RobertaTokenizer (RoBERTa model) This class cannot be instantiated using `__init__()` (throw an error). """ @@ -60,13 +62,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): The tokenizer class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertTokenizer (DistilBert model) + - contains `roberta`: RobertaTokenizer (XLM model) - contains `bert`: BertTokenizer (Bert model) - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) - contains `xlnet`: XLNetTokenizer (XLNet model) - contains `xlm`: XLMTokenizer (XLM model) - - contains `roberta`: RobertaTokenizer (XLM model) Params: pretrained_model_name_or_path: either: @@ -95,6 +98,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): config = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. 
tokenizer was saved using `save_pretrained('./test/saved_model/')` """ + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) if 'roberta' in pretrained_model_name_or_path: return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'bert' in pretrained_model_name_or_path: From 80aa87d9a3faf9a523073471a0d56a7e0e2c29ce Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 22:24:23 +0200 Subject: [PATCH 056/119] fix distilbert tokenizer --- pytorch_transformers/tokenization_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_auto.py b/pytorch_transformers/tokenization_auto.py index 447d360ca2edae..357d184e58f82e 100644 --- a/pytorch_transformers/tokenization_auto.py +++ b/pytorch_transformers/tokenization_auto.py @@ -25,7 +25,7 @@ from .tokenization_xlnet import XLNetTokenizer from .tokenization_xlm import XLMTokenizer from .tokenization_roberta import RobertaTokenizer -from.tokenization_distilbert import DistilBertTokenizer +from .tokenization_distilbert import DistilBertTokenizer logger = logging.getLogger(__name__) From 256086bc6908448fc6aff9b1e19d95c4f6019bee Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 22:34:23 +0200 Subject: [PATCH 057/119] clean up and simplify hubconf --- hubconf.py | 118 ++++++++- hubconfs/automodels_hubconf.py | 110 -------- hubconfs/bert_hubconf.py | 360 -------------------------- hubconfs/gpt2_hubconf.py | 168 ------------ hubconfs/gpt_hubconf.py | 186 ------------- hubconfs/transformer_xl_hubconf.py | 130 ---------- hubconfs/xlm_hubconf.py | 167 ------------ hubconfs/xlnet_hubconf.1.py | 169 ------------ pytorch_transformers/modeling_auto.py | 5 - 9 files changed, 110 insertions(+), 1303 deletions(-) delete mode 100644 hubconfs/automodels_hubconf.py delete mode 100644 hubconfs/bert_hubconf.py delete mode 100644 hubconfs/gpt2_hubconf.py delete mode 100644 hubconfs/gpt_hubconf.py delete mode 100644 hubconfs/transformer_xl_hubconf.py delete mode 100644 hubconfs/xlm_hubconf.py delete mode 100644 hubconfs/xlnet_hubconf.1.py diff --git a/hubconf.py b/hubconf.py index 05afd63a46f774..35e7f1eea83255 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,10 +1,112 @@ +from pytorch_transformers import ( + AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering +) +from pytorch_transformers.modeling_utils import add_start_docstrings + dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses'] -from hubconfs.automodels_hubconf import ( - config, - model, - modelForQuestionAnswering, - modelForSequenceClassification, - modelWithLMHead, - tokenizer, -) +@add_start_docstrings(AutoConfig.__doc__) +def config(*args, **kwargs): + r""" + # Using torch.hub ! + import torch + + config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache. + config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/') # E.g. 
config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json') + config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False) + assert config.output_attention == True + config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True) + assert config.output_attention == True + assert unused_kwargs == {'foo': False} + + """ + + return AutoConfig.from_pretrained(*args, **kwargs) + + +@add_start_docstrings(AutoTokenizer.__doc__) +def tokenizer(*args, **kwargs): + r""" + # Using torch.hub ! + import torch + + tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache. + tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` + + """ + + return AutoTokenizer.from_pretrained(*args, **kwargs) + + +@add_start_docstrings(AutoModel.__doc__) +def model(*args, **kwargs): + r""" + # Using torch.hub ! + import torch + + model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + + return AutoModel.from_pretrained(*args, **kwargs) + +@add_start_docstrings(AutoModelWithLMHead.__doc__) +def modelWithLMHead(*args, **kwargs): + r""" + # Using torch.hub ! + import torch + + model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + return AutoModelWithLMHead.from_pretrained(*args, **kwargs) + + +@add_start_docstrings(AutoModelForSequenceClassification.__doc__) +def modelForSequenceClassification(*args, **kwargs): + r""" + # Using torch.hub ! 
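A minimal end-to-end sketch of how the entrypoints defined in this hubconf.py are typically combined (an illustration only, assuming the `bert-base-uncased` shortcut name used in the docstrings above and that the pretrained weights can be downloaded):

    import torch

    # Load a matching tokenizer and base model through the consolidated hub entrypoints.
    tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')
    model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')
    model.eval()  # from_pretrained already returns the model in evaluation mode; this just makes it explicit

    # Tokenize a sentence pair and run a forward pass.
    text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    tokens_tensor = torch.tensor([indexed_tokens])
    with torch.no_grad():
        last_hidden_states = model(tokens_tensor)[0]  # first element of the output tuple

Note that in the AutoTokenizer dispatch extended earlier in this series, the `distilbert` and `roberta` branches are checked before the generic `bert` branch: both names contain the substring `bert`, so reversing the order would silently return a BertTokenizer for them.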
+ import torch + + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + + return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs) + + +@add_start_docstrings(AutoModelForQuestionAnswering.__doc__) +def modelForQuestionAnswering(*args, **kwargs): + r""" + # Using torch.hub ! + import torch + + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs) diff --git a/hubconfs/automodels_hubconf.py b/hubconfs/automodels_hubconf.py deleted file mode 100644 index 5c1ab5ebc6e26e..00000000000000 --- a/hubconfs/automodels_hubconf.py +++ /dev/null @@ -1,110 +0,0 @@ -from pytorch_transformers import ( - AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering -) -from pytorch_transformers.modeling_utils import add_start_docstrings - -@add_start_docstrings(AutoConfig.__doc__) -def config(*args, **kwargs): - r""" - # Using torch.hub ! - import torch - - config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache. - config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/') # E.g. 
config (or model) was saved using `save_pretrained('./test/saved_model/')` - config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json') - config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False) - assert config.output_attention == True - config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True) - assert config.output_attention == True - assert unused_kwargs == {'foo': False} - - """ - - return AutoConfig.from_pretrained(*args, **kwargs) - - -@add_start_docstrings(AutoTokenizer.__doc__) -def tokenizer(*args, **kwargs): - r""" - # Using torch.hub ! - import torch - - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache. - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` - - """ - - return AutoTokenizer.from_pretrained(*args, **kwargs) - - -@add_start_docstrings(AutoModel.__doc__) -def model(*args, **kwargs): - r""" - # Using torch.hub ! - import torch - - model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache. - model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` - model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True) # Update configuration during loading - assert model.config.output_attention == True - # Loading from a TF checkpoint file instead of a PyTorch model (slower) - config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') - model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) - - """ - - return AutoModel.from_pretrained(*args, **kwargs) - -@add_start_docstrings(AutoModelWithLMHead.__doc__) -def modelWithLMHead(*args, **kwargs): - r""" - # Using torch.hub ! - import torch - - model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache. - model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` - model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True) # Update configuration during loading - assert model.config.output_attention == True - # Loading from a TF checkpoint file instead of a PyTorch model (slower) - config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') - model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) - - """ - return AutoModelWithLMHead.from_pretrained(*args, **kwargs) - - -@add_start_docstrings(AutoModelForSequenceClassification.__doc__) -def modelForSequenceClassification(*args, **kwargs): - r""" - # Using torch.hub ! 
- import torch - - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache. - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading - assert model.config.output_attention == True - # Loading from a TF checkpoint file instead of a PyTorch model (slower) - config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) - - """ - - return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs) - - -@add_start_docstrings(AutoModelForQuestionAnswering.__doc__) -def modelForQuestionAnswering(*args, **kwargs): - r""" - # Using torch.hub ! - import torch - - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache. - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading - assert model.config.output_attention == True - # Loading from a TF checkpoint file instead of a PyTorch model (slower) - config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) - - """ - return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs) diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py deleted file mode 100644 index 6e2830617f0796..00000000000000 --- a/hubconfs/bert_hubconf.py +++ /dev/null @@ -1,360 +0,0 @@ -from pytorch_transformers.tokenization_bert import BertTokenizer -from pytorch_transformers.modeling_bert import ( - BertModel, - BertForNextSentencePrediction, - BertForMaskedLM, - BertForMultipleChoice, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, - ) - -# A lot of models share the same param doc. Use a decorator -# to save typing -bert_docstring = """ - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load - . `bert-base-uncased` - . `bert-large-uncased` - . `bert-base-cased` - . `bert-large-cased` - . `bert-base-multilingual-uncased` - . `bert-base-multilingual-cased` - . `bert-base-chinese` - . `bert-base-german-cased` - . `bert-large-uncased-whole-word-masking` - . `bert-large-cased-whole-word-masking` - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining - instance - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . 
`model.chkpt` a TensorFlow checkpoint - from_tf: should we load the weights from a locally saved TensorFlow - checkpoint - cache_dir: an optional path to a folder in which the pre-trained models - will be cached. - state_dict: an optional state dictionary - (collections.OrderedDict object) to use instead of Google - pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) -""" - - -def _append_from_pretrained_docstring(docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + docstr - return fn - return docstring_decorator - - -def bertTokenizer(*args, **kwargs): - """ - Instantiate a BertTokenizer from a pre-trained/customized vocab file - Args: - pretrained_model_name_or_path: Path to pretrained model archive - or one of pre-trained vocab configs below. - * bert-base-uncased - * bert-large-uncased - * bert-base-cased - * bert-large-cased - * bert-base-multilingual-uncased - * bert-base-multilingual-cased - * bert-base-chinese - Keyword args: - cache_dir: an optional path to a specific directory to download and cache - the pre-trained model weights. - Default: None - do_lower_case: Whether to lower case the input. - Only has an effect when do_wordpiece_only=False - Default: True - do_basic_tokenize: Whether to do basic tokenization before wordpiece. - Default: True - max_len: An artificial maximum length to truncate tokenized sequences to; - Effective maximum length is always the minimum of this - value (if specified) and the underlying BERT model's - sequence length. - Default: None - never_split: List of tokens which will never be split during tokenization. - Only has an effect when do_wordpiece_only=False - Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"] - - Example: - import torch - sentence = 'Hello, World!' - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) - toks = tokenizer.tokenize(sentence) - ['Hello', '##,', 'World', '##!'] - ids = tokenizer.convert_tokens_to_ids(toks) - [8667, 28136, 1291, 28125] - """ - tokenizer = BertTokenizer.from_pretrained(*args, **kwargs) - return tokenizer - - -@_append_from_pretrained_docstring(bert_docstring) -def bertModel(*args, **kwargs): - """ - BertModel is the basic BERT Transformer model with a layer of summed token, - position and sequence embeddings followed by a series of identical - self-attention blocks (12 for BERT-base, 24 for BERT-large). - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) - # Prepare tokenized input - text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" - tokenized_text = tokenizer.tokenize(text) - indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] - tokens_tensor = torch.tensor([indexed_tokens]) - segments_tensors = torch.tensor([segments_ids]) - # Load bertModel - model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased') - model.eval() - # Predict hidden states features for each layer - with torch.no_grad(): - encoded_layers, _ = model(tokens_tensor, segments_tensors) - """ - model = BertModel.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForNextSentencePrediction(*args, **kwargs): - """ - BERT model with next sentence prediction head. - This module comprises the BERT model followed by the next sentence - classification head. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) - # Prepare tokenized input - text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" - tokenized_text = tokenizer.tokenize(text) - indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] - tokens_tensor = torch.tensor([indexed_tokens]) - segments_tensors = torch.tensor([segments_ids]) - # Load bertForNextSentencePrediction - model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased') - model.eval() - # Predict the next sentence classification logits - with torch.no_grad(): - next_sent_classif_logits = model(tokens_tensor, segments_tensors) - """ - model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForPreTraining(*args, **kwargs): - """ - BERT model with pre-training heads. - This module comprises the BERT model followed by the two pre-training heads - - the masked language modeling head, and - - the next sentence classification head. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) - # Prepare tokenized input - text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" - tokenized_text = tokenizer.tokenize(text) - segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] - tokens_tensor = torch.tensor([indexed_tokens]) - segments_tensors = torch.tensor([segments_ids]) - # Load bertForPreTraining - model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased') - masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors) - """ - model = BertForPreTraining.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForMaskedLM(*args, **kwargs): - """ - BertForMaskedLM includes the BertModel Transformer followed by the - (possibly) pre-trained masked language modeling head. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) - # Prepare tokenized input - text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" - tokenized_text = tokenizer.tokenize(text) - masked_index = 8 - tokenized_text[masked_index] = '[MASK]' - indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] - tokens_tensor = torch.tensor([indexed_tokens]) - segments_tensors = torch.tensor([segments_ids]) - # Load bertForMaskedLM - model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased') - model.eval() - # Predict all tokens - with torch.no_grad(): - predictions = model(tokens_tensor, segments_tensors) - predicted_index = torch.argmax(predictions[0, masked_index]).item() - predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] - 'henson' - """ - model = BertForMaskedLM.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForSequenceClassification(*args, **kwargs): - """ - BertForSequenceClassification is a fine-tuning model that includes - BertModel and a sequence-level (sequence or pair of sequences) classifier - on top of the BertModel. Note that the classification head is only initialized - and has to be trained. - - The sequence-level classifier is a linear layer that takes as input the - last hidden state of the first character in the input sequence - (see Figures 3a and 3b in the BERT paper). - - Args: - num_labels: the number (>=2) of classes for the classifier. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) - # Prepare tokenized input - text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" - tokenized_text = tokenizer.tokenize(text) - indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] - tokens_tensor = torch.tensor([indexed_tokens]) - segments_tensors = torch.tensor([segments_ids]) - # Load bertForSequenceClassification - model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2) - model.eval() - # Predict the sequence classification logits - with torch.no_grad(): - seq_classif_logits = model(tokens_tensor, segments_tensors) - # Or get the sequence classification loss - labels = torch.tensor([1]) - seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss - """ - model = BertForSequenceClassification.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForMultipleChoice(*args, **kwargs): - """ - BertForMultipleChoice is a fine-tuning model that includes BertModel and a - linear layer on top of the BertModel. Note that the multiple choice head is - only initialized and has to be trained. - - Args: - num_choices: the number (>=2) of classes for the classifier. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) - # Prepare tokenized input - text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" - tokenized_text = tokenizer.tokenize(text) - indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] - tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0) - segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0) - # Load bertForMultipleChoice - model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2) - model.eval() - # Predict the multiple choice logits - with torch.no_grad(): - multiple_choice_logits = model(tokens_tensor, segments_tensors) - # Or get the multiple choice loss - labels = torch.tensor([1]) - multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss - """ - model = BertForMultipleChoice.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForQuestionAnswering(*args, **kwargs): - """ - BertForQuestionAnswering is a fine-tuning model that includes BertModel - with a token-level classifiers on top of the full sequence of last hidden - states. Note that the classification head is only initialized - and has to be trained. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) - # Prepare tokenized input - text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" - tokenized_text = tokenizer.tokenize(text) - indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] - tokens_tensor = torch.tensor([indexed_tokens]) - segments_tensors = torch.tensor([segments_ids]) - # Load bertForQuestionAnswering - model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased') - model.eval() - # Predict the start and end positions logits - with torch.no_grad(): - start_logits, end_logits = model(tokens_tensor, segments_tensors) - # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions - start_positions, end_positions = torch.tensor([12]), torch.tensor([14]) - # set model.train() before if training this loss - multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions) - """ - model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForTokenClassification(*args, **kwargs): - """ - BertForTokenClassification is a fine-tuning model that includes BertModel - and a token-level classifier on top of the BertModel. Note that the classification - head is only initialized and has to be trained. - - The token-level classifier is a linear layer that takes as input the last - hidden state of the sequence. - - Args: - num_labels: the number (>=2) of classes for the classifier. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) - # Prepare tokenized input - text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" - tokenized_text = tokenizer.tokenize(text) - indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] - tokens_tensor = torch.tensor([indexed_tokens]) - segments_tensors = torch.tensor([segments_ids]) - # Load bertForTokenClassification - model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2) - model.eval() - # Predict the token classification logits - with torch.no_grad(): - classif_logits = model(tokens_tensor, segments_tensors) - # Or get the token classification loss - labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]]) - classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss - """ - model = BertForTokenClassification.from_pretrained(*args, **kwargs) - return model diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py deleted file mode 100644 index 18afad39133137..00000000000000 --- a/hubconfs/gpt2_hubconf.py +++ /dev/null @@ -1,168 +0,0 @@ -from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer -from pytorch_transformers.modeling_gpt2 import ( - GPT2Model, - GPT2LMHeadModel, - GPT2DoubleHeadsModel -) - -# A lot of models share the same param doc. Use a decorator -# to save typing -gpt2_docstring = """ - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `gpt2`, `gpt2-medium` - - a path or url to a pretrained model archive containing: - . `gpt2_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance - - a path or url to a pretrained model archive containing: - . `gpt2_config.json` a configuration file for the model - . a TensorFlow checkpoint with trained weights - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific GPT-2 class -""" - - -def _append_from_pretrained_docstring(docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + docstr - return fn - return docstring_decorator - - -def gpt2Tokenizer(*args, **kwargs): - """ - Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file. - Peculiarities: - - Byte-level BPE - - Args: - pretrained_model_name_or_path: Path to pretrained model archive - or one of pre-trained vocab configs below. - * gpt2 - Keyword args: - special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...) - Default: None - max_len: An artificial maximum length to truncate tokenized sequences to; - Effective maximum length is always the minimum of this - value (if specified) and the underlying BERT model's - sequence length. - Default: None - - Example: - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2') - - text = "Who was Jim Henson ?" 
- indexed_tokens = tokenizer.encode(tokenized_text) - """ - tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs) - return tokenizer - - -@_append_from_pretrained_docstring(gpt2_docstring) -def gpt2Model(*args, **kwargs): - """ - gpt2Model is the basic OpenAI GPT-2 Transformer model based on - identical stacked masked self-attention blocks and pre-trained - on large scale dataset using language modeling signal. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2') - - # Prepare tokenized input - text_1 = "Who was Jim Henson ?" - text_2 = "Jim Henson was a puppeteer" - indexed_tokens_1 = tokenizer.encode(text_1) - indexed_tokens_2 = tokenizer.encode(text_2) - tokens_tensor_1 = torch.tensor([indexed_tokens_1]) - tokens_tensor_2 = torch.tensor([indexed_tokens_2]) - - # Load gpt2Model - model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2') - model.eval() - - # Predict hidden states features for each layer - # past can be used to reuse precomputed hidden state in a subsequent predictions - with torch.no_grad(): - hidden_states_1, past = model(tokens_tensor_1) - hidden_states_2, past = model(tokens_tensor_2, past=past) - """ - model = GPT2Model.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(gpt2_docstring) -def gpt2LMHeadModel(*args, **kwargs): - """ - gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the - tied (pre-trained) language modeling head on top. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2') - - # Prepare tokenized input - text_1 = "Who was Jim Henson ?" - text_2 = "Jim Henson was a puppeteer" - indexed_tokens_1 = tokenizer.encode(text_1) - indexed_tokens_2 = tokenizer.encode(text_2) - tokens_tensor_1 = torch.tensor([indexed_tokens_1]) - tokens_tensor_2 = torch.tensor([indexed_tokens_2]) - - # Load gpt2LMHeadModel - model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2') - model.eval() - - # Predict hidden states features for each layer - # past can be used to reuse precomputed hidden state in a subsequent predictions - with torch.no_grad(): - predictions_1, past = model(tokens_tensor_1) - predictions_2, past = model(tokens_tensor_2, past=past) - - # Get the predicted last token - predicted_index = torch.argmax(predictions_2[0, -1, :]).item() - predicted_token = tokenizer.decode([predicted_index]) - assert predicted_token == ' who' - """ - model = GPT2LMHeadModel.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(gpt2_docstring) -def gpt2DoubleHeadsModel(*args, **kwargs): - """ - gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the - tied (pre-trained) language modeling head and a multiple choice - classification head (only initialized, not pre-trained). - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2') - - # Prepare tokenized input - text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" - text2 = "Who was Jim Henson ? 
Jim Henson was a mysterious young man" - tokenized_text1 = tokenizer.tokenize(text1) - tokenized_text2 = tokenizer.tokenize(text2) - indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) - indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) - tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) - mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) - - # Load gpt2DoubleHeadsModel - model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2') - model.eval() - - # Predict hidden states features for each layer - with torch.no_grad(): - lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids) - """ - model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs) - return model diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py deleted file mode 100644 index 649075980c2642..00000000000000 --- a/hubconfs/gpt_hubconf.py +++ /dev/null @@ -1,186 +0,0 @@ -from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer -from pytorch_transformers.modeling_openai import ( - OpenAIGPTModel, - OpenAIGPTLMHeadModel, - OpenAIGPTDoubleHeadsModel -) - -# Dependecies that are not specified in global hubconf.py -specific_dependencies = ['spacy', 'ftfy'] - -# A lot of models share the same param doc. Use a decorator -# to save typing -gpt_docstring = """ - OpenAI GPT use a single embedding matrix to store the word and special embeddings. - Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... - Special tokens need to be trained during the fine-tuning if you use them. - The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. - - The embeddings are ordered as follow in the token embeddings matrice: - [0, ---------------------- - ... -> word embeddings - config.vocab_size - 1, ______________________ - config.vocab_size, - ... -> special embeddings - config.vocab_size + config.n_special - 1] ______________________ - - where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: - total_tokens_embeddings = config.vocab_size + config.n_special - You should use the associate indices to index the embeddings. - - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `openai-gpt` - - a path or url to a pretrained model archive containing: - . `openai_gpt_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance - - a path or url to a pretrained model archive containing: - . `openai-gpt-config.json` a configuration file for the model - . a series of NumPy files containing OpenAI TensorFlow trained weights - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionary (collections.OrderedDict object) - to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific OpenAI-GPT class -""" - - -def _append_from_pretrained_docstring(docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + docstr - return fn - return docstring_decorator - - -def openAIGPTTokenizer(*args, **kwargs): - """ - Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file. 
- Peculiarities: - - lower case all inputs - - uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. - - argument special_tokens and function set_special_tokens: - can be used to add additional symbols (ex: "__classify__") to a vocabulary. - - Args: - pretrained_model_name_or_path: Path to pretrained model archive - or one of pre-trained vocab configs below. - * openai-gpt - Keyword args: - special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...) - Default: None - max_len: An artificial maximum length to truncate tokenized sequences to; - Effective maximum length is always the minimum of this - value (if specified) and the underlying BERT model's - sequence length. - Default: None - - Example: - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt') - - text = "Who was Jim Henson ? Jim Henson was a puppeteer" - tokenized_text = tokenizer.tokenize(text) - indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - [763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483] - """ - tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs) - return tokenizer - - -@_append_from_pretrained_docstring(gpt_docstring) -def openAIGPTModel(*args, **kwargs): - """ - OpenAIGPTModel is the basic OpenAI GPT Transformer model based on - identical stacked masked self-attention blocks and pre-trained - on large scale dataset using language modeling signal. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt') - - # Prepare tokenized input - text = "Who was Jim Henson ? Jim Henson was a puppeteer" - tokenized_text = tokenizer.tokenize(text) - indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - tokens_tensor = torch.tensor([indexed_tokens]) - - # Load openAIGPTModel - model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt') - model.eval() - - # Predict hidden states features for each layer - with torch.no_grad(): - hidden_states = model(tokens_tensor) - """ - model = OpenAIGPTModel.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(gpt_docstring) -def openAIGPTLMHeadModel(*args, **kwargs): - """ - OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the - tied (pre-trained) language modeling head on top. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt') - - # Prepare tokenized input - text = "Who was Jim Henson ? Jim Henson was a puppeteer" - tokenized_text = tokenizer.tokenize(text) - indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - tokens_tensor = torch.tensor([indexed_tokens]) - - # Load openAIGPTLMHeadModel - model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt') - model.eval() - - # Predict hidden states features for each layer - with torch.no_grad(): - predictions = model(tokens_tensor) - - # Get the predicted last token - predicted_index = torch.argmax(predictions[0, -1, :]).item() - predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] - '.' 
- """ - model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(gpt_docstring) -def openAIGPTDoubleHeadsModel(*args, **kwargs): - """ - OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the - tied (pre-trained) language modeling head and a multiple choice - classification head (only initialized, not pre-trained). - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt') - - # Prepare tokenized input - text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" - text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man" - tokenized_text1 = tokenizer.tokenize(text1) - tokenized_text2 = tokenizer.tokenize(text2) - indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) - indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) - tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) - mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) - - # Load openAIGPTDoubleHeadsModel - model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt') - model.eval() - - # Predict hidden states features for each layer - with torch.no_grad(): - lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids) - """ - model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs) - return model diff --git a/hubconfs/transformer_xl_hubconf.py b/hubconfs/transformer_xl_hubconf.py deleted file mode 100644 index 548d4075818cb9..00000000000000 --- a/hubconfs/transformer_xl_hubconf.py +++ /dev/null @@ -1,130 +0,0 @@ -from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer -from pytorch_transformers.modeling_transfo_xl import ( - TransfoXLModel, - TransfoXLLMHeadModel -) - -# A lot of models share the same param doc. Use a decorator -# to save typing -transformer_xl_docstring = """ - Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that: - - you don't need to specify positioning embeddings indices - - the tokens in the vocabulary have to be sorted to decreasing frequency. - - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `transfo-xl-wt103` - - a path or url to a pretrained model archive containing: - . `transfo_xl_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance - - a path or url to a pretrained model archive containing: - . `transfo_xl_config.json` a configuration file for the model - . `model.chkpt` a TensorFlow checkpoint - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. 
- state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific TransformerXL class -""" - - -def _append_from_pretrained_docstring(docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + docstr - return fn - return docstring_decorator - - -def transformerXLTokenizer(*args, **kwargs): - """ - Instantiate a Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl - - Args: - pretrained_model_name_or_path: Path to pretrained model archive - or one of pre-trained vocab configs below. - * transfo-xl-wt103 - - Example: - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103') - - text = "Who was Jim Henson ?" - tokenized_text = tokenizer.tokenize(tokenized_text) - indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - """ - tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs) - return tokenizer - - -@_append_from_pretrained_docstring(transformer_xl_docstring) -def transformerXLModel(*args, **kwargs): - """ - transformerXLModel is the basic Transformer XL model. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103') - - # Prepare tokenized input - text_1 = "Who was Jim Henson ?" - text_2 = "Jim Henson was a puppeteer" - tokenized_text_1 = tokenizer.tokenize(text_1) - tokenized_text_2 = tokenizer.tokenize(text_2) - indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1) - indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2) - tokens_tensor_1 = torch.tensor([indexed_tokens_1]) - tokens_tensor_2 = torch.tensor([indexed_tokens_2]) - - # Load transformerXLModel - model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103') - model.eval() - - # Predict hidden states features for each layer - # We can re-use the memory cells in a subsequent call to attend a longer context - with torch.no_grad(): - hidden_states_1, mems_1 = model(tokens_tensor_1) - hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1) - """ - model = TransfoXLModel.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(transformer_xl_docstring) -def transformerXLLMHeadModel(*args, **kwargs): - """ - transformerXLModel is the basic Transformer XL model with the - tied (pre-trained) language modeling head on top. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103') - - # Prepare tokenized input - text_1 = "Who was Jim Henson ?" 
- text_2 = "Jim Henson was a puppeteer" - tokenized_text_1 = tokenizer.tokenize(text_1) - tokenized_text_2 = tokenizer.tokenize(text_2) - indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1) - indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2) - tokens_tensor_1 = torch.tensor([indexed_tokens_1]) - tokens_tensor_2 = torch.tensor([indexed_tokens_2]) - - # Load transformerXLLMHeadModel - model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103') - model.eval() - - # Predict hidden states features for each layer - # We can re-use the memory cells in a subsequent call to attend a longer context - with torch.no_grad(): - predictions_1, mems_1 = model(tokens_tensor_1) - predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1) - - # Get the predicted last token - predicted_index = torch.argmax(predictions_2[0, -1, :]).item() - predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] - assert predicted_token == 'who' - """ - model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs) - return model diff --git a/hubconfs/xlm_hubconf.py b/hubconfs/xlm_hubconf.py deleted file mode 100644 index e96d9239445466..00000000000000 --- a/hubconfs/xlm_hubconf.py +++ /dev/null @@ -1,167 +0,0 @@ -from pytorch_transformers.tokenization_xlm import XLMTokenizer -from pytorch_transformers.modeling_xlm import ( - XLMConfig, - XLMModel, - XLMWithLMHeadModel, - XLMForSequenceClassification, - XLMForQuestionAnswering -) - -# A lot of models share the same param doc. Use a decorator -# to save typing -xlm_start_docstring = """ - Model class adapted from the XLM Transformer model of - "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau - Paper: https://arxiv.org/abs/1901.07291 - Original code: https://github.com/facebookresearch/XLM - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048') - - # Prepare tokenized input - text_1 = "Who was Jim Henson ?" - text_2 = "Jim Henson was a puppeteer" - indexed_tokens_1 = tokenizer.encode(text_1) - indexed_tokens_2 = tokenizer.encode(text_2) - tokens_tensor_1 = torch.tensor([indexed_tokens_1]) - tokens_tensor_2 = torch.tensor([indexed_tokens_2]) -""" - -# A lot of models share the same param doc. Use a decorator -# to save typing -xlm_end_docstring = """ - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `xlm-mlm-en-2048` - - a path or url to a pretrained model archive containing: - . `config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump created using the `convert_xlm_checkpoint_to_pytorch` conversion script - cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific XLM class -""" - - -def _begin_with_docstring(docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + docstr - return fn - return docstring_decorator - -def _end_with_docstring(docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + docstr - return fn - return docstring_decorator - - -def xlmTokenizer(*args, **kwargs): - """ - Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file. 
- - Args: - pretrained_model_name_or_path: Path to pretrained model archive - or one of pre-trained vocab configs below. - * xlm-mlm-en-2048 - Keyword args: - special_tokens: Special tokens in vocabulary that are not pretrained - Default: None - max_len: An artificial maximum length to truncate tokenized sequences to; - Effective maximum length is always the minimum of this - value (if specified) and the underlying model's - sequence length. - Default: None - - Example: - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048') - - text = "Who was Jim Henson ?" - indexed_tokens = tokenizer.encode(tokenized_text) - """ - tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs) - return tokenizer - - -@_begin_with_docstring(xlm_start_docstring) -@_end_with_docstring(xlm_end_docstring) -def xlmModel(*args, **kwargs): - """ - # Load xlmModel - model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048') - model.eval() - - # Predict hidden states features for each layer - with torch.no_grad(): - hidden_states_1, mems = model(tokens_tensor_1) - hidden_states_2, mems = model(tokens_tensor_2, past=mems) - """ - model = XLMModel.from_pretrained(*args, **kwargs) - return model - - -@_begin_with_docstring(xlm_start_docstring) -@_end_with_docstring(xlm_end_docstring) -def xlmLMHeadModel(*args, **kwargs): - """ - # Prepare tokenized input - text_1 = "Who was Jim Henson ?" - text_2 = "Jim Henson was a puppeteer" - indexed_tokens_1 = tokenizer.encode(text_1) - indexed_tokens_2 = tokenizer.encode(text_2) - tokens_tensor_1 = torch.tensor([indexed_tokens_1]) - tokens_tensor_2 = torch.tensor([indexed_tokens_2]) - - # Load xlnetLMHeadModel - model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlm-mlm-en-2048') - model.eval() - - # Predict hidden states features for each layer - with torch.no_grad(): - predictions_1, mems = model(tokens_tensor_1) - predictions_2, mems = model(tokens_tensor_2, mems=mems) - - # Get the predicted last token - predicted_index = torch.argmax(predictions_2[0, -1, :]).item() - predicted_token = tokenizer.decode([predicted_index]) - assert predicted_token == ' who' - """ - model = XLMWithLMHeadModel.from_pretrained(*args, **kwargs) - return model - - -# @_end_with_docstring(xlnet_docstring) -# def xlnetForSequenceClassification(*args, **kwargs): -# """ -# xlnetModel is the basic XLNet Transformer model from -# "XLNet: Generalized Autoregressive Pretraining for Language Understanding" -# by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le - -# Example: -# # Load the tokenizer -# import torch -# tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048') - -# # Prepare tokenized input -# text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" -# text2 = "Who was Jim Henson ? 
Jim Henson was a mysterious young man" -# tokenized_text1 = tokenizer.tokenize(text1) -# tokenized_text2 = tokenizer.tokenize(text2) -# indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) -# indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) -# tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) -# mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) - -# # Load xlnetForSequenceClassification -# model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048') -# model.eval() - -# # Predict sequence classes logits -# with torch.no_grad(): -# lm_logits, mems = model(tokens_tensor) -# """ -# model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs) -# return model diff --git a/hubconfs/xlnet_hubconf.1.py b/hubconfs/xlnet_hubconf.1.py deleted file mode 100644 index fa7b7ddb9f7fd8..00000000000000 --- a/hubconfs/xlnet_hubconf.1.py +++ /dev/null @@ -1,169 +0,0 @@ -from pytorch_transformers.tokenization_xlnet import XLNetTokenizer -from pytorch_transformers.modeling_xlnet import ( - XLNetConfig, - XLNetModel, - XLNetLMHeadModel, - # XLNetForSequenceClassification -) - -# A lot of models share the same param doc. Use a decorator -# to save typing -xlnet_docstring = """ - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `xlnet-large-cased` - - a path or url to a pretrained model archive containing: - . `config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance - - a path or url to a pretrained model archive containing: - . `xlnet_config.json` a configuration file for the model - . `model.chkpt` a TensorFlow checkpoint - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific XLNet class -""" - - -def _append_from_pretrained_docstring(docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + docstr - return fn - return docstring_decorator - - -def xlnetTokenizer(*args, **kwargs): - """ - Instantiate a XLNet sentencepiece tokenizer for XLNet from a pre-trained vocab file. - Peculiarities: - - require Google sentencepiece (https://github.com/google/sentencepiece) - - Args: - pretrained_model_name_or_path: Path to pretrained model archive - or one of pre-trained vocab configs below. - * xlnet-large-cased - Keyword args: - special_tokens: Special tokens in vocabulary that are not pretrained - Default: None - max_len: An artificial maximum length to truncate tokenized sequences to; - Effective maximum length is always the minimum of this - value (if specified) and the underlying model's - sequence length. - Default: None - - Example: - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased') - - text = "Who was Jim Henson ?" 
- indexed_tokens = tokenizer.encode(tokenized_text) - """ - tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs) - return tokenizer - - -@_append_from_pretrained_docstring(xlnet_docstring) -def xlnetModel(*args, **kwargs): - """ - xlnetModel is the basic XLNet Transformer model from - "XLNet: Generalized Autoregressive Pretraining for Language Understanding" - by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased') - - # Prepare tokenized input - text_1 = "Who was Jim Henson ?" - text_2 = "Jim Henson was a puppeteer" - indexed_tokens_1 = tokenizer.encode(text_1) - indexed_tokens_2 = tokenizer.encode(text_2) - tokens_tensor_1 = torch.tensor([indexed_tokens_1]) - tokens_tensor_2 = torch.tensor([indexed_tokens_2]) - - # Load xlnetModel - model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased') - model.eval() - - # Predict hidden states features for each layer - with torch.no_grad(): - hidden_states_1, mems = model(tokens_tensor_1) - hidden_states_2, mems = model(tokens_tensor_2, past=mems) - """ - model = XLNetModel.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(xlnet_docstring) -def xlnetLMHeadModel(*args, **kwargs): - """ - xlnetModel is the basic XLNet Transformer model from - "XLNet: Generalized Autoregressive Pretraining for Language Understanding" - by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le - with a tied (pre-trained) language modeling head on top. - - Example: - # Load the tokenizer - import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased') - - # Prepare tokenized input - text_1 = "Who was Jim Henson ?" - text_2 = "Jim Henson was a puppeteer" - indexed_tokens_1 = tokenizer.encode(text_1) - indexed_tokens_2 = tokenizer.encode(text_2) - tokens_tensor_1 = torch.tensor([indexed_tokens_1]) - tokens_tensor_2 = torch.tensor([indexed_tokens_2]) - - # Load xlnetLMHeadModel - model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased') - model.eval() - - # Predict hidden states features for each layer - with torch.no_grad(): - predictions_1, mems = model(tokens_tensor_1) - predictions_2, mems = model(tokens_tensor_2, mems=mems) - - # Get the predicted last token - predicted_index = torch.argmax(predictions_2[0, -1, :]).item() - predicted_token = tokenizer.decode([predicted_index]) - assert predicted_token == ' who' - """ - model = XLNetLMHeadModel.from_pretrained(*args, **kwargs) - return model - - -# @_append_from_pretrained_docstring(xlnet_docstring) -# def xlnetForSequenceClassification(*args, **kwargs): -# """ -# xlnetModel is the basic XLNet Transformer model from -# "XLNet: Generalized Autoregressive Pretraining for Language Understanding" -# by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le - -# Example: -# # Load the tokenizer -# import torch -# tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased') - -# # Prepare tokenized input -# text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" -# text2 = "Who was Jim Henson ? 
Jim Henson was a mysterious young man" -# tokenized_text1 = tokenizer.tokenize(text1) -# tokenized_text2 = tokenizer.tokenize(text2) -# indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) -# indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) -# tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) -# mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) - -# # Load xlnetForSequenceClassification -# model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased') -# model.eval() - -# # Predict sequence classes logits -# with torch.no_grad(): -# lm_logits, mems = model(tokens_tensor) -# """ -# model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs) -# return model diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py index 0c328909c2cd19..05ff5e5b33cf45 100644 --- a/pytorch_transformers/modeling_auto.py +++ b/pytorch_transformers/modeling_auto.py @@ -18,11 +18,6 @@ import logging -import torch -import torch.nn as nn -from torch.nn import CrossEntropyLoss, MSELoss -from torch.nn.parameter import Parameter - from .modeling_bert import BertConfig, BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTLMHeadModel from .modeling_gpt2 import GPT2Config, GPT2Model, GPT2LMHeadModel From bfe93a5a21a77e0a0f9b35132810aab9d0b1f04c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 22:43:26 +0200 Subject: [PATCH 058/119] fix distilbert in auto tokenizer --- pytorch_transformers/tokenization_auto.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_transformers/tokenization_auto.py b/pytorch_transformers/tokenization_auto.py index 357d184e58f82e..889774b36c9255 100644 --- a/pytorch_transformers/tokenization_auto.py +++ b/pytorch_transformers/tokenization_auto.py @@ -94,13 +94,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): Examples:: - config = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache. - config = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` + tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache. + tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. 
tokenizer was saved using `save_pretrained('./test/saved_model/')` """ if 'distilbert' in pretrained_model_name_or_path: return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - if 'roberta' in pretrained_model_name_or_path: + elif 'roberta' in pretrained_model_name_or_path: return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'bert' in pretrained_model_name_or_path: return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) From 09363f2a8b9f0020964a056be621de9012094ed0 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 30 Aug 2019 19:48:32 -0400 Subject: [PATCH 059/119] Fix documentation index --- docs/source/index.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index d349e146c9b27b..5b451707d60db8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,7 +11,8 @@ The library currently contains PyTorch implementations, pre-trained model weight 4. `Transformer-XL `_ (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `_ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 5. `XLNet `_ (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding `_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 6. `XLM `_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining `_ by Guillaume Lample and Alexis Conneau. -7. `DistilBERT `_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT `_ by Victor Sanh, Lysandre Debut and Thomas Wolf. +7. `RoBERTa `_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach `_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +8. `DistilBERT `_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT `_ by Victor Sanh, Lysandre Debut and Thomas Wolf. .. 
toctree:: :maxdepth: 2 From 574c5b3a72d6b8ae5e232750229710d355787add Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Fri, 30 Aug 2019 20:09:24 -0400 Subject: [PATCH 060/119] [RoBERTa] LayerNorm's eps is not a nn.Parameter so there's no point setting it on the model Instead we correctly store it on the config (regenerating the hosted config files) cc @lysandrejik --- .../convert_roberta_checkpoint_to_pytorch.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py index 0a8967426e9598..743013e4c49707 100644 --- a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py @@ -53,6 +53,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ intermediate_size=roberta.args.encoder_ffn_embed_dim, max_position_embeddings=514, type_vocab_size=1, + layer_norm_eps=1e-5, # PyTorch default used in fairseq ) if classification_head: config.num_labels = roberta.args.num_classes @@ -69,7 +70,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them. model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias - model.roberta.embeddings.LayerNorm.variance_epsilon = roberta_sent_encoder.emb_layer_norm.eps for i in range(config.num_hidden_layers): # Encoder: start of layer @@ -98,7 +98,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ self_output.dense.bias = roberta_layer.self_attn.out_proj.bias self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias - self_output.LayerNorm.variance_epsilon = roberta_layer.self_attn_layer_norm.eps ### intermediate intermediate: BertIntermediate = layer.intermediate @@ -117,7 +116,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ bert_output.dense.bias = roberta_layer.fc2.bias bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias - bert_output.LayerNorm.variance_epsilon = roberta_layer.final_layer_norm.eps #### end of layer if classification_head: @@ -131,7 +129,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias - model.lm_head.layer_norm.variance_epsilon = roberta.model.decoder.lm_head.layer_norm.eps model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight model.lm_head.bias = roberta.model.decoder.lm_head.bias @@ -144,6 +141,8 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ else: their_output = roberta.model(input_ids)[0] print(our_output.shape, their_output.shape) + max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() + print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 success = torch.allclose(our_output, their_output, atol=1e-3) print( "Do 
both models output the same tensors?", From 1d438f15b36b8d4f522a543211854f19a8b91cc2 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Fri, 30 Aug 2019 20:20:15 -0400 Subject: [PATCH 061/119] [XLNet] Use pytorch's layernorm like in BERT See #1089 cc @thomwolf @lysandrejik Also @dhpollack --- pytorch_transformers/modeling_xlnet.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py index ebf8c1fd63cbdd..cc9c1379a1e976 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/pytorch_transformers/modeling_xlnet.py @@ -337,20 +337,7 @@ def num_hidden_layers(self): from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm except (ImportError, AttributeError) as e: logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .") - class XLNetLayerNorm(nn.Module): - def __init__(self, d_model, eps=1e-12): - """Construct a layernorm module in the TF style (epsilon inside the square root). - """ - super(XLNetLayerNorm, self).__init__() - self.weight = nn.Parameter(torch.ones(d_model)) - self.bias = nn.Parameter(torch.zeros(d_model)) - self.variance_epsilon = eps - - def forward(self, x): - u = x.mean(-1, keepdim=True) - s = (x - u).pow(2).mean(-1, keepdim=True) - x = (x - u) / torch.sqrt(s + self.variance_epsilon) - return self.weight * x + self.bias + from torch.nn import LayerNorm as XLNetLayerNorm class XLNetRelativeAttention(nn.Module): def __init__(self, config): From b65a994f5997480bd5a8e204bd10a2b3005fdc2e Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Fri, 30 Aug 2019 20:33:16 -0400 Subject: [PATCH 062/119] [ci] decrease parallelism to increase success prob --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 30555731ef01a3..3251d195cece40 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,7 +5,7 @@ jobs: docker: - image: circleci/python:3.5 resource_class: large - parallelism: 4 + parallelism: 1 steps: - checkout - run: sudo pip install --progress-bar off . 
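Patch 061 above relies on the hand-written TF-style layer norm it removes being numerically the same as `torch.nn.LayerNorm` (both keep `eps` inside the square root and use the biased variance). A small self-contained check of that equivalence, with the shapes chosen arbitrarily and `eps` set to the removed default of 1e-12:

    import torch
    import torch.nn as nn

    torch.manual_seed(0)
    d_model, eps = 16, 1e-12
    x = torch.randn(2, 5, d_model)

    # The formula used by the deleted XLNetLayerNorm, with its default weight/bias.
    weight = torch.ones(d_model)
    bias = torch.zeros(d_model)
    u = x.mean(-1, keepdim=True)
    s = (x - u).pow(2).mean(-1, keepdim=True)
    manual = weight * ((x - u) / torch.sqrt(s + eps)) + bias

    layer_norm = nn.LayerNorm(d_model, eps=eps)  # freshly initialized: weight=1, bias=0
    print(torch.allclose(manual, layer_norm(x), atol=1e-6))  # True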
@@ -17,7 +17,7 @@ jobs: build_py2: working_directory: ~/pytorch-transformers resource_class: large - parallelism: 4 + parallelism: 1 docker: - image: circleci/python:2.7 steps: From 0fd0b674e672c49e678d6ead97f0119edda751a3 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Fri, 30 Aug 2019 20:36:26 -0400 Subject: [PATCH 063/119] [ci] legible output [skip ci] --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3251d195cece40..a35e20d146c947 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -35,8 +35,8 @@ jobs: fingerprints: - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71" - checkout - - run: sudo pip install -r docs/requirements.txt - - run: sudo pip install -r requirements.txt + - run: sudo pip install --progress-bar off -r docs/requirements.txt + - run: sudo pip install --progress-bar off -r requirements.txt - run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir workflow_filters: &workflow_filters filters: From 7f006cdd875799d7d66a8bda964ab90b0f7ed3f4 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 30 Aug 2019 23:58:49 -0400 Subject: [PATCH 064/119] Set seed for head_masking test --- pytorch_transformers/tests/modeling_common_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py index 29f41be7de91ef..8b9a2ffd17059d 100644 --- a/pytorch_transformers/tests/modeling_common_test.py +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -163,6 +163,7 @@ def test_headmasking(self): if not self.test_head_masking: return + torch.manual_seed(42) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.output_attentions = True From d7a4c3252ed5e630b7fb6e4b4616daddfe574fc5 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Sat, 31 Aug 2019 00:08:56 -0400 Subject: [PATCH 065/119] Fixed filename --- .../{modeling_dilbert_test.py => modeling_distilbert_test.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pytorch_transformers/tests/{modeling_dilbert_test.py => modeling_distilbert_test.py} (100%) diff --git a/pytorch_transformers/tests/modeling_dilbert_test.py b/pytorch_transformers/tests/modeling_distilbert_test.py similarity index 100% rename from pytorch_transformers/tests/modeling_dilbert_test.py rename to pytorch_transformers/tests/modeling_distilbert_test.py From 42e00cf9e1969973a563db2900ed86bbf58dbc71 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Mon, 19 Aug 2019 22:43:02 -0400 Subject: [PATCH 066/119] Pruning saved to configuration first try --- pytorch_transformers/modeling_bert.py | 6 ++ pytorch_transformers/modeling_utils.py | 10 ++++ .../tests/modeling_common_test.py | 56 +++++++++++++++++++ 3 files changed, 72 insertions(+) diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index f918afff3ea47c..4a68c2b96b6228 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -649,6 +649,12 @@ def __init__(self, config): self.encoder = BertEncoder(config) self.pooler = BertPooler(config) + if hasattr(config, "pruned_heads"): + pruned_heads = config.pruned_heads.copy().items() + for layer, heads in pruned_heads: + if self.encoder.layer[int(layer)].attention.self.num_attention_heads == config.num_attention_heads: + self.prune_heads({int(layer): list(map(int, heads))}) + self.apply(self.init_weights) def 
_resize_token_embeddings(self, new_num_tokens): diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 0d4fce67f0c3c3..351fbfd0e14a1d 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -104,6 +104,7 @@ def __init__(self, **kwargs): self.output_attentions = kwargs.pop('output_attentions', False) self.output_hidden_states = kwargs.pop('output_hidden_states', False) self.torchscript = kwargs.pop('torchscript', False) + self.pruned_heads = kwargs.pop('pruned_heads', {}) def save_pretrained(self, save_directory): """ Save a configuration object to the directory `save_directory`, so that it @@ -363,6 +364,15 @@ def prune_heads(self, heads_to_prune): heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). """ base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed + + for layer, heads in heads_to_prune.items(): + if str(layer) not in self.config.pruned_heads: + self.config.pruned_heads[str(layer)] = heads + else: + for head in heads: + if head not in self.config.pruned_heads[str(layer)]: + self.config.pruned_heads[str(layer)].append(head) + base_model._prune_heads(heads_to_prune) def save_pretrained(self, save_directory): diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py index 8b9a2ffd17059d..7ed1eddbfba661 100644 --- a/pytorch_transformers/tests/modeling_common_test.py +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -219,6 +219,7 @@ def test_head_pruning(self): del inputs_dict["head_mask"] for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.output_attentions = True config.output_hidden_states = False model = model_class(config=config) @@ -237,6 +238,61 @@ def test_head_pruning(self): self.assertEqual( attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + def test_head_pruning_save_load_from_pretrained(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_attentions = True + config.output_hidden_states = False + model = model_class(config=config) + model.eval() + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), + -1: [0]} + model.prune_heads(heads_to_prune) + directory = "pruned_model" + if not os.path.exists(directory): + os.makedirs(directory) + model.save_pretrained(directory) + model = model_class.from_pretrained(directory) + + outputs = model(**inputs_dict) + attentions = outputs[-1] + self.assertEqual( + attentions[0].shape[-3], 1) + self.assertEqual( + attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual( + attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + + shutil.rmtree(directory) + + def test_head_pruning_save_load_from_config_init(self): + print(self.test_pruning) + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_attentions = True + config.output_hidden_states = False + + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), + -1: [0]} + config.pruned_heads = heads_to_prune + + model = model_class(config=config) + model.eval() + + 
outputs = model(**inputs_dict) + attentions = outputs[-1] + self.assertEqual( + attentions[0].shape[-3], 1) + self.assertEqual( + attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual( + attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) def test_hidden_states_output(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() From fc1fbae45df552eb4ff5220463cbde11cfa2b71e Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 21 Aug 2019 18:57:30 -0400 Subject: [PATCH 067/119] XLM can be pruned --- pytorch_transformers/modeling_xlm.py | 6 ++++++ pytorch_transformers/tests/modeling_common_test.py | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index 035787a97b2c79..cf121eee416cb9 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -559,6 +559,12 @@ def __init__(self, config): #, dico, is_encoder, with_output): self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config)) self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + if hasattr(config, "pruned_heads"): + pruned_heads = config.pruned_heads.copy().items() + for layer, heads in pruned_heads: + if self.attentions[int(layer)].n_heads == config.n_heads: + self.prune_heads({int(layer): list(map(int, heads))}) + self.apply(self.init_weights) def _resize_token_embeddings(self, new_num_tokens): diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py index 7ed1eddbfba661..dbb041ab0542e0 100644 --- a/pytorch_transformers/tests/modeling_common_test.py +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -269,7 +269,6 @@ def test_head_pruning_save_load_from_pretrained(self): shutil.rmtree(directory) def test_head_pruning_save_load_from_config_init(self): - print(self.test_pruning) if not self.test_pruning: return From 719cb3738d442431d246c107899b40441c3dd5ae Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 21 Aug 2019 20:12:06 -0400 Subject: [PATCH 068/119] Pruning for GPT and GPT-2 --- pytorch_transformers/modeling_gpt2.py | 6 ++++++ pytorch_transformers/modeling_openai.py | 6 ++++++ .../tests/modeling_common_test.py | 17 ++++++++++++----- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 283dc68a6aed51..23cc7f53132ed4 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -453,6 +453,12 @@ def __init__(self, config): self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) + if hasattr(config, "pruned_heads"): + pruned_heads = config.pruned_heads.copy().items() + for layer, heads in pruned_heads: + if self.h[int(layer)].attn.n_head == config.n_head: + self.prune_heads({int(layer): list(map(int, heads))}) + self.apply(self.init_weights) def _resize_token_embeddings(self, new_num_tokens): diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index 690aa7812be16b..c640b7c86c2efc 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -456,6 +456,12 @@ def __init__(self, config): self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in 
range(config.n_layer)]) + if hasattr(config, "pruned_heads"): + pruned_heads = config.pruned_heads.copy().items() + for layer, heads in pruned_heads: + if self.h[int(layer)].attn.n_head == config.n_head: + self.prune_heads({int(layer): list(map(int, heads))}) + self.apply(self.init_weights) def _resize_token_embeddings(self, new_num_tokens): diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py index dbb041ab0542e0..c06c5011530ba1 100644 --- a/pytorch_transformers/tests/modeling_common_test.py +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -213,13 +213,12 @@ def test_head_pruning(self): if not self.test_pruning: return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - if "head_mask" in inputs_dict: - del inputs_dict["head_mask"] - for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + config.output_attentions = True config.output_hidden_states = False model = model_class(config=config) @@ -244,6 +243,10 @@ def test_head_pruning_save_load_from_pretrained(self): for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + config.output_attentions = True config.output_hidden_states = False model = model_class(config=config) @@ -274,6 +277,10 @@ def test_head_pruning_save_load_from_config_init(self): for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + config.output_attentions = True config.output_hidden_states = False From 87747518e94860e730606848e6a8d2ed68ae8a51 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 21 Aug 2019 21:20:39 -0400 Subject: [PATCH 069/119] Blocks deletion from already deleted heads. Necessary integration test. Now raises a warning when a head to be deleted already has been deleted. An integration test verifying the total pipeline (-> from config -> save model -> load model -> additional head pruning) has been added. 
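A round-trip sketch of the pipeline this series of patches adds, assuming a build of `pytorch_transformers` that includes them: pruned heads set on the configuration are applied at construction, survive `save_pretrained`, are re-applied on load, and further pruning can follow. The config sizes, the chosen heads and the scratch directory name are arbitrary:

    import os
    import shutil
    from pytorch_transformers import BertConfig, BertModel

    config = BertConfig(100, hidden_size=64, num_hidden_layers=4,
                        num_attention_heads=4, intermediate_size=128)
    config.pruned_heads = {0: [0], 1: [1, 2]}       # prune from the configuration
    model = BertModel(config)                       # heads are removed when the model is built

    directory = "pruned_model_sketch"
    os.makedirs(directory, exist_ok=True)
    model.save_pretrained(directory)                # pruned_heads is written into config.json

    reloaded = BertModel.from_pretrained(directory) # pruning is re-applied on load
    print(reloaded.config.pruned_heads)             # {0: [0], 1: [1, 2]} (lists or sets, depending on the patch)
    print(reloaded.encoder.layer[0].attention.self.num_attention_heads)  # 3
    print(reloaded.encoder.layer[1].attention.self.num_attention_heads)  # 2

    # Additional pruning after the reload, given by original head indices; heads
    # that were already removed are skipped with a warning.
    reloaded.prune_heads({0: [0], 2: [1, 2]})
    print(reloaded.config.pruned_heads)
    shutil.rmtree(directory)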
--- pytorch_transformers/modeling_bert.py | 1 + pytorch_transformers/modeling_gpt2.py | 1 + pytorch_transformers/modeling_openai.py | 1 + pytorch_transformers/modeling_utils.py | 23 ++++-- pytorch_transformers/modeling_xlm.py | 1 + .../tests/modeling_common_test.py | 76 ++++++++++++++++--- 6 files changed, 85 insertions(+), 18 deletions(-) diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index 4a68c2b96b6228..5a65e442d02d5c 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -651,6 +651,7 @@ def __init__(self, config): if hasattr(config, "pruned_heads"): pruned_heads = config.pruned_heads.copy().items() + config.pruned_heads = {} for layer, heads in pruned_heads: if self.encoder.layer[int(layer)].attention.self.num_attention_heads == config.num_attention_heads: self.prune_heads({int(layer): list(map(int, heads))}) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 23cc7f53132ed4..8aa5347c717f2b 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -455,6 +455,7 @@ def __init__(self, config): if hasattr(config, "pruned_heads"): pruned_heads = config.pruned_heads.copy().items() + config.pruned_heads = {} for layer, heads in pruned_heads: if self.h[int(layer)].attn.n_head == config.n_head: self.prune_heads({int(layer): list(map(int, heads))}) diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index c640b7c86c2efc..ce3768c6762e6d 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -458,6 +458,7 @@ def __init__(self, config): if hasattr(config, "pruned_heads"): pruned_heads = config.pruned_heads.copy().items() + config.pruned_heads = {} for layer, heads in pruned_heads: if self.h[int(layer)].attn.n_head == config.n_head: self.prune_heads({int(layer): list(map(int, heads))}) diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 351fbfd0e14a1d..0a47d07fd4e630 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -201,6 +201,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): # Load config config = cls.from_json_file(resolved_config_file) + if hasattr(config, 'pruned_heads'): + config.pruned_heads = {int(key): value for key, value in config.pruned_heads.items()} + + # Update config with kwargs if needed to_remove = [] for key, value in kwargs.items(): @@ -365,15 +369,22 @@ def prune_heads(self, heads_to_prune): """ base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed + to_be_pruned = {} + for layer, heads in heads_to_prune.items(): - if str(layer) not in self.config.pruned_heads: - self.config.pruned_heads[str(layer)] = heads + if int(layer) not in self.config.pruned_heads: + self.config.pruned_heads[int(layer)] = heads + to_be_pruned[int(layer)] = heads else: for head in heads: - if head not in self.config.pruned_heads[str(layer)]: - self.config.pruned_heads[str(layer)].append(head) - - base_model._prune_heads(heads_to_prune) + if head not in self.config.pruned_heads[int(layer)]: + self.config.pruned_heads[int(layer)].append(head) + to_be_pruned[int(layer)].append(head) + else: + logger.warning(f"Tried to remove head {head} of layer {layer} but it was already removed. 
" + f"The removed heads are {heads_to_prune}") + + base_model._prune_heads(to_be_pruned) def save_pretrained(self, save_directory): """ Save a model and its configuration file to a directory, so that it diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index cf121eee416cb9..1e0f8d7c77234f 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -561,6 +561,7 @@ def __init__(self, config): #, dico, is_encoder, with_output): if hasattr(config, "pruned_heads"): pruned_heads = config.pruned_heads.copy().items() + config.pruned_heads = {} for layer, heads in pruned_heads: if self.attentions[int(layer)].n_heads == config.n_heads: self.prune_heads({int(layer): list(map(int, heads))}) diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py index c06c5011530ba1..8b1a70fcf3c372 100644 --- a/pytorch_transformers/tests/modeling_common_test.py +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -262,12 +262,9 @@ def test_head_pruning_save_load_from_pretrained(self): outputs = model(**inputs_dict) attentions = outputs[-1] - self.assertEqual( - attentions[0].shape[-3], 1) - self.assertEqual( - attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual( - attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) shutil.rmtree(directory) @@ -293,12 +290,67 @@ def test_head_pruning_save_load_from_config_init(self): outputs = model(**inputs_dict) attentions = outputs[-1] - self.assertEqual( - attentions[0].shape[-3], 1) - self.assertEqual( - attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual( - attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + + def test_head_pruning_integration(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + config.output_attentions = True + config.output_hidden_states = False + + heads_to_prune = {0: [0], 1: [1, 2]} + config.pruned_heads = heads_to_prune + + model = model_class(config=config) + model.eval() + + outputs = model(**inputs_dict) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + directory = "pruned_model" + + if not os.path.exists(directory): + os.makedirs(directory) + model.save_pretrained(directory) + model = model_class.from_pretrained(directory) + shutil.rmtree(directory) + + outputs = model(**inputs_dict) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) + 
self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + heads_to_prune = {0: [0], 2: [1, 2]} + model.prune_heads(heads_to_prune) + + outputs = model(**inputs_dict) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]}) + def test_hidden_states_output(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() From 5c2b94c82aa48db997cfaf9dc63dbd520ac45609 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 21 Aug 2019 21:24:48 -0400 Subject: [PATCH 070/119] Changed string so that Circle CI accepts the warning --- pytorch_transformers/modeling_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 0a47d07fd4e630..5a89badba64b35 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -381,8 +381,9 @@ def prune_heads(self, heads_to_prune): self.config.pruned_heads[int(layer)].append(head) to_be_pruned[int(layer)].append(head) else: - logger.warning(f"Tried to remove head {head} of layer {layer} but it was already removed. " - f"The removed heads are {heads_to_prune}") + logger.warning("Tried to remove head " + head + + " of layer " + layer + + " but it was already removed. The current removed heads are " + heads_to_prune) base_model._prune_heads(to_be_pruned) From c85b5db61a8825edda59a0e9f12bc1be08c63cdc Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 21 Aug 2019 21:37:30 -0400 Subject: [PATCH 071/119] Conditional append/init + fixed warning --- pytorch_transformers/modeling_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 5a89badba64b35..c69cba49e3e61e 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -379,11 +379,15 @@ def prune_heads(self, heads_to_prune): for head in heads: if head not in self.config.pruned_heads[int(layer)]: self.config.pruned_heads[int(layer)].append(head) - to_be_pruned[int(layer)].append(head) + + if int(layer) in to_be_pruned: + to_be_pruned[int(layer)].append(head) + else: + to_be_pruned[int(layer)] = [head] else: - logger.warning("Tried to remove head " + head + - " of layer " + layer + - " but it was already removed. The current removed heads are " + heads_to_prune) + logger.warning("Tried to remove head " + str(head) + + " of layer " + str(layer) + + " but it was already removed. 
The current removed heads are " + str(heads_to_prune)) base_model._prune_heads(to_be_pruned) From 0cd283522ab46a9c1c50576be4fd309c08974d8e Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 27 Aug 2019 15:56:59 -0400 Subject: [PATCH 072/119] Attempt to fix head index --- pytorch_transformers/modeling_gpt2.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 8aa5347c717f2b..8b39ad372e3bc0 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -233,12 +233,14 @@ def __init__(self, nx, n_ctx, config, scale=False): self.c_proj = Conv1D(n_state, nx) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) + self.pruned_heads = [] def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.n_head, self.split_size // self.n_head) for head in heads: + head -= len(list(filter(lambda h: h < head, self.pruned_heads))) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() @@ -249,6 +251,7 @@ def prune_heads(self, heads): # Update hyper params self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) self.n_head = self.n_head - len(heads) + self.pruned_heads.extend(heads) def _attn(self, q, k, v, head_mask=None): w = torch.matmul(q, k) From 0c8e823b031d99d06bddff2b88fd4da2d7500117 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 29 Aug 2019 17:20:11 -0400 Subject: [PATCH 073/119] Added patch to remaining models --- pytorch_transformers/modeling_bert.py | 3 +++ pytorch_transformers/modeling_openai.py | 3 +++ pytorch_transformers/modeling_xlm.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index 5a65e442d02d5c..9aa25edbe3c01d 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -337,12 +337,14 @@ def __init__(self, config): super(BertAttention, self).__init__() self.self = BertSelfAttention(config) self.output = BertSelfOutput(config) + self.pruned_heads = [] def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) for head in heads: + head -= len(list(filter(lambda h: h < head, self.pruned_heads))) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() @@ -354,6 +356,7 @@ def prune_heads(self, heads): # Update hyper params self.self.num_attention_heads = self.self.num_attention_heads - len(heads) self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads.extend(heads) def forward(self, input_tensor, attention_mask, head_mask=None): self_outputs = self.self(input_tensor, attention_mask, head_mask) diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index ce3768c6762e6d..78e57b0c592ab6 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -249,12 +249,14 @@ def __init__(self, nx, n_ctx, config, scale=False): self.c_proj = Conv1D(n_state, nx) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) + self.pruned_heads = [] def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.n_head, self.split_size // self.n_head) for head in heads: + head -= len(list(filter(lambda h: h < head, 
self.pruned_heads))) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() @@ -265,6 +267,7 @@ def prune_heads(self, heads): # Update hyper params self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) self.n_head = self.n_head - len(heads) + self.pruned_heads.extend(heads) def _attn(self, q, k, v, head_mask=None): w = torch.matmul(q, k) diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index 1e0f8d7c77234f..17e39528f82dbd 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -271,6 +271,7 @@ def __init__(self, n_heads, dim, config): self.k_lin = nn.Linear(dim, dim) self.v_lin = nn.Linear(dim, dim) self.out_lin = nn.Linear(dim, dim) + self.pruned_heads = [] def prune_heads(self, heads): attention_head_size = self.dim // self.n_heads @@ -278,6 +279,7 @@ def prune_heads(self, heads): return mask = torch.ones(self.n_heads, attention_head_size) for head in heads: + head -= len(list(filter(lambda h: h < head, self.pruned_heads))) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() @@ -289,6 +291,7 @@ def prune_heads(self, heads): # Update hyper params self.n_heads = self.n_heads - len(heads) self.dim = attention_head_size * self.n_heads + self.pruned_heads.extend(heads) def forward(self, input, mask, kv=None, cache=None, head_mask=None): """ From bdb4409ed8de4d199907c75832398f2c49a564e1 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Sat, 31 Aug 2019 01:59:07 +0200 Subject: [PATCH 074/119] updated pruning logic with sets - Bert and GPT-2 --- pytorch_transformers/modeling_bert.py | 43 +++++++++++--------------- pytorch_transformers/modeling_gpt2.py | 25 +++++++-------- pytorch_transformers/modeling_utils.py | 40 ++++++++++-------------- 3 files changed, 46 insertions(+), 62 deletions(-) diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index 9aa25edbe3c01d..e2d83460713504 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -337,26 +337,30 @@ def __init__(self, config): super(BertAttention, self).__init__() self.self = BertSelfAttention(config) self.output = BertSelfOutput(config) - self.pruned_heads = [] + self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) + heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads for head in heads: - head -= len(list(filter(lambda h: h < head, self.pruned_heads))) + # Compute how many pruned heads are before the head and move the index accordingly + head = head - sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() + # Prune linear layers self.self.query = prune_linear_layer(self.self.query, index) self.self.key = prune_linear_layer(self.self.key, index) self.self.value = prune_linear_layer(self.self.value, index) self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - # Update hyper params + + # Update hyper params and store pruned heads self.self.num_attention_heads = self.self.num_attention_heads - len(heads) self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads.extend(heads) + self.pruned_heads = self.pruned_heads.union(heads) def forward(self, input_tensor, 
attention_mask, head_mask=None): self_outputs = self.self(input_tensor, attention_mask, head_mask) @@ -534,12 +538,8 @@ class BertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_bert base_model_prefix = "bert" - def __init__(self, *inputs, **kwargs): - super(BertPreTrainedModel, self).__init__(*inputs, **kwargs) - - def init_weights(self, module): - """ Initialize the weights. - """ + def _init_weights(self, module): + """ Initialize the weights """ if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 @@ -652,14 +652,7 @@ def __init__(self, config): self.encoder = BertEncoder(config) self.pooler = BertPooler(config) - if hasattr(config, "pruned_heads"): - pruned_heads = config.pruned_heads.copy().items() - config.pruned_heads = {} - for layer, heads in pruned_heads: - if self.encoder.layer[int(layer)].attention.self.num_attention_heads == config.num_attention_heads: - self.prune_heads({int(layer): list(map(int, heads))}) - - self.apply(self.init_weights) + self.init_weights() def _resize_token_embeddings(self, new_num_tokens): old_embeddings = self.embeddings.word_embeddings @@ -768,7 +761,7 @@ def __init__(self, config): self.bert = BertModel(config) self.cls = BertPreTrainingHeads(config) - self.apply(self.init_weights) + self.init_weights() self.tie_weights() def tie_weights(self): @@ -836,7 +829,7 @@ def __init__(self, config): self.bert = BertModel(config) self.cls = BertOnlyMLMHead(config) - self.apply(self.init_weights) + self.init_weights() self.tie_weights() def tie_weights(self): @@ -901,7 +894,7 @@ def __init__(self, config): self.bert = BertModel(config) self.cls = BertOnlyNSPHead(config) - self.apply(self.init_weights) + self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, position_ids=None, head_mask=None): @@ -962,7 +955,7 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) - self.apply(self.init_weights) + self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, position_ids=None, head_mask=None): @@ -1066,7 +1059,7 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) - self.apply(self.init_weights) + self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, position_ids=None, head_mask=None): @@ -1134,7 +1127,7 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.apply(self.init_weights) + self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, position_ids=None, head_mask=None): @@ -1208,7 +1201,7 @@ def __init__(self, config): self.bert = BertModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.apply(self.init_weights) + self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None, position_ids=None, head_mask=None): diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 8b39ad372e3bc0..017ad4f7b47819 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ 
b/pytorch_transformers/modeling_gpt2.py @@ -233,25 +233,29 @@ def __init__(self, nx, n_ctx, config, scale=False): self.c_proj = Conv1D(n_state, nx) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) - self.pruned_heads = [] + self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.n_head, self.split_size // self.n_head) + heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads for head in heads: - head -= len(list(filter(lambda h: h < head, self.pruned_heads))) + # Compute how many pruned heads are before the head and move the index accordingly + head = head - sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)]) + # Prune conv1d layers self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + # Update hyper params self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) self.n_head = self.n_head - len(heads) - self.pruned_heads.extend(heads) + self.pruned_heads = self.pruned_heads.union(heads) def _attn(self, q, k, v, head_mask=None): w = torch.matmul(q, k) @@ -357,7 +361,7 @@ class GPT2PreTrainedModel(PreTrainedModel): def __init__(self, *inputs, **kwargs): super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs) - def init_weights(self, module): + def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): @@ -456,14 +460,7 @@ def __init__(self, config): self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) - if hasattr(config, "pruned_heads"): - pruned_heads = config.pruned_heads.copy().items() - config.pruned_heads = {} - for layer, heads in pruned_heads: - if self.h[int(layer)].attn.n_head == config.n_head: - self.prune_heads({int(layer): list(map(int, heads))}) - - self.apply(self.init_weights) + self.init_weights() def _resize_token_embeddings(self, new_num_tokens): self.wte = self._get_resized_embeddings(self.wte, new_num_tokens) @@ -594,7 +591,7 @@ def __init__(self, config): self.transformer = GPT2Model(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) - self.apply(self.init_weights) + self.init_weights() self.tie_weights() def tie_weights(self): @@ -718,7 +715,7 @@ def __init__(self, config): self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.multiple_choice_head = SequenceSummary(config) - self.apply(self.init_weights) + self.init_weights() self.tie_weights() def tie_weights(self): diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index c69cba49e3e61e..33bcb968b5d8f5 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -202,8 +202,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): config = cls.from_json_file(resolved_config_file) if hasattr(config, 'pruned_heads'): - config.pruned_heads = {int(key): value for key, value in config.pruned_heads.items()} - + config.pruned_heads = dict((int(key), set(value)) for key, value in config.pruned_heads.items()) # Update config with kwargs if needed to_remove = [] @@ -316,7 +315,7 @@ def 
_get_resized_embeddings(self, old_embeddings, new_num_tokens=None): new_embeddings.to(old_embeddings.weight.device) # initialize all new embeddings (in particular added tokens) - self.init_weights(new_embeddings) + self._init_weights(new_embeddings) # Copy word embeddings from the previous weights num_tokens_to_copy = min(old_num_tokens, new_num_tokens) @@ -360,36 +359,31 @@ def resize_token_embeddings(self, new_num_tokens=None): return model_embeds + def init_weights(self): + """ Initialize and prunes weights if needed. """ + # Initialize weights + self.apply(self._init_weights) + + # Prune heads if needed + if self.config.pruned_heads: + self.prune_heads(self.config.pruned_heads) + def prune_heads(self, heads_to_prune): """ Prunes heads of the base model. Arguments: heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). + E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. """ base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed - to_be_pruned = {} - + # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads for layer, heads in heads_to_prune.items(): - if int(layer) not in self.config.pruned_heads: - self.config.pruned_heads[int(layer)] = heads - to_be_pruned[int(layer)] = heads - else: - for head in heads: - if head not in self.config.pruned_heads[int(layer)]: - self.config.pruned_heads[int(layer)].append(head) - - if int(layer) in to_be_pruned: - to_be_pruned[int(layer)].append(head) - else: - to_be_pruned[int(layer)] = [head] - else: - logger.warning("Tried to remove head " + str(head) + - " of layer " + str(layer) + - " but it was already removed. 
The current removed heads are " + str(heads_to_prune)) - - base_model._prune_heads(to_be_pruned) + union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) + self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON + + base_model._prune_heads(heads_to_prune) def save_pretrained(self, save_directory): """ Save a model and its configuration file to a directory, so that it From b6992b7b476fe7e231c8e144e36582fbbde0b4d4 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Sat, 31 Aug 2019 00:33:11 -0400 Subject: [PATCH 075/119] Applied patch to OpenAI GPT, RoBERTa, TransfoL, XLM and XLNet --- pytorch_transformers/modeling_openai.py | 25 +++++++-------------- pytorch_transformers/modeling_roberta.py | 4 ++-- pytorch_transformers/modeling_transfo_xl.py | 9 +++----- pytorch_transformers/modeling_xlm.py | 17 +++++++------- pytorch_transformers/modeling_xlnet.py | 13 +++++------ 5 files changed, 27 insertions(+), 41 deletions(-) diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index 78e57b0c592ab6..8bf9d86696c154 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -249,14 +249,15 @@ def __init__(self, nx, n_ctx, config, scale=False): self.c_proj = Conv1D(n_state, nx) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) - self.pruned_heads = [] + self.pruned_heads = set() def prune_heads(self, heads): if len(heads) == 0: return mask = torch.ones(self.n_head, self.split_size // self.n_head) + heads = set(heads) - self.pruned_heads for head in heads: - head -= len(list(filter(lambda h: h < head, self.pruned_heads))) + head -= sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() @@ -267,7 +268,7 @@ def prune_heads(self, heads): # Update hyper params self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) self.n_head = self.n_head - len(heads) - self.pruned_heads.extend(heads) + self.pruned_heads = self.pruned_heads.union(heads) def _attn(self, q, k, v, head_mask=None): w = torch.matmul(q, k) @@ -366,10 +367,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_openai_gpt base_model_prefix = "transformer" - def __init__(self, *inputs, **kwargs): - super(OpenAIGPTPreTrainedModel, self).__init__(*inputs, **kwargs) - - def init_weights(self, module): + def _init_weights(self, module): """ Initialize the weights. 
""" if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): @@ -459,14 +457,7 @@ def __init__(self, config): self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) - if hasattr(config, "pruned_heads"): - pruned_heads = config.pruned_heads.copy().items() - config.pruned_heads = {} - for layer, heads in pruned_heads: - if self.h[int(layer)].attn.n_head == config.n_head: - self.prune_heads({int(layer): list(map(int, heads))}) - - self.apply(self.init_weights) + self.init_weights() def _resize_token_embeddings(self, new_num_tokens): self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens) @@ -579,7 +570,7 @@ def __init__(self, config): self.transformer = OpenAIGPTModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) - self.apply(self.init_weights) + self.init_weights() self.tie_weights() def tie_weights(self): @@ -686,7 +677,7 @@ def __init__(self, config): self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.multiple_choice_head = SequenceSummary(config) - self.apply(self.init_weights) + self.init_weights() self.tie_weights() def tie_weights(self): diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py index cbd88ab86e899a..6ae5cd1d44048f 100644 --- a/pytorch_transformers/modeling_roberta.py +++ b/pytorch_transformers/modeling_roberta.py @@ -168,7 +168,7 @@ def __init__(self, config): super(RobertaModel, self).__init__(config) self.embeddings = RobertaEmbeddings(config) - self.apply(self.init_weights) + self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None): if input_ids[:, 0].sum().item() != 0: @@ -220,7 +220,7 @@ def __init__(self, config): self.roberta = RobertaModel(config) self.lm_head = RobertaLMHead(config) - self.apply(self.init_weights) + self.init_weights() self.tie_weights() def tie_weights(self): diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py index 283fa66daf7474..0c5c5b779835ff 100644 --- a/pytorch_transformers/modeling_transfo_xl.py +++ b/pytorch_transformers/modeling_transfo_xl.py @@ -853,9 +853,6 @@ class TransfoXLPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_transfo_xl base_model_prefix = "transformer" - def __init__(self, *inputs, **kwargs): - super(TransfoXLPreTrainedModel, self).__init__(*inputs, **kwargs) - def _init_weight(self, weight): if self.config.init == 'uniform': nn.init.uniform_(weight, -self.config.init_range, self.config.init_range) @@ -865,7 +862,7 @@ def _init_weight(self, weight): def _init_bias(self, bias): nn.init.constant_(bias, 0.0) - def init_weights(self, m): + def _init_weights(self, m): """ Initialize the weights. 
""" classname = m.__class__.__name__ @@ -1059,7 +1056,7 @@ def __init__(self, config): self.r_emb = nn.Parameter(torch.FloatTensor( self.n_layer, self.max_klen, self.n_head, self.d_head)) - self.apply(self.init_weights) + self.init_weights() def _resize_token_embeddings(self, new_num_tokens): return self.word_emb @@ -1306,7 +1303,7 @@ def __init__(self, config): else: self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val) - self.apply(self.init_weights) + self.init_weights() self.tie_weights() def tie_weights(self): diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index 17e39528f82dbd..9eff09b362a297 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -271,15 +271,16 @@ def __init__(self, n_heads, dim, config): self.k_lin = nn.Linear(dim, dim) self.v_lin = nn.Linear(dim, dim) self.out_lin = nn.Linear(dim, dim) - self.pruned_heads = [] + self.pruned_heads = set() def prune_heads(self, heads): attention_head_size = self.dim // self.n_heads if len(heads) == 0: return mask = torch.ones(self.n_heads, attention_head_size) + heads = set(heads) - self.pruned_heads for head in heads: - head -= len(list(filter(lambda h: h < head, self.pruned_heads))) + head -= sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() @@ -291,7 +292,7 @@ def prune_heads(self, heads): # Update hyper params self.n_heads = self.n_heads - len(heads) self.dim = attention_head_size * self.n_heads - self.pruned_heads.extend(heads) + self.pruned_heads = self.pruned_heads.union(heads) def forward(self, input, mask, kv=None, cache=None, head_mask=None): """ @@ -386,7 +387,7 @@ class XLMPreTrainedModel(PreTrainedModel): def __init__(self, *inputs, **kwargs): super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs) - def init_weights(self, module): + def _init_weights(self, module): """ Initialize the weights. 
""" if isinstance(module, nn.Embedding): if self.config is not None and self.config.embed_init_std is not None: @@ -569,7 +570,7 @@ def __init__(self, config): #, dico, is_encoder, with_output): if self.attentions[int(layer)].n_heads == config.n_heads: self.prune_heads({int(layer): list(map(int, heads))}) - self.apply(self.init_weights) + self.init_weights() def _resize_token_embeddings(self, new_num_tokens): self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens) @@ -781,7 +782,7 @@ def __init__(self, config): self.transformer = XLMModel(config) self.pred_layer = XLMPredLayer(config) - self.apply(self.init_weights) + self.init_weights() self.tie_weights() def tie_weights(self): @@ -843,7 +844,7 @@ def __init__(self, config): self.transformer = XLMModel(config) self.sequence_summary = SequenceSummary(config) - self.apply(self.init_weights) + self.init_weights() def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None, attention_mask=None, cache=None, labels=None, head_mask=None): @@ -921,7 +922,7 @@ def __init__(self, config): self.transformer = XLMModel(config) self.qa_outputs = SQuADHead(config) - self.apply(self.init_weights) + self.init_weights() def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None, attention_mask=None, cache=None, start_positions=None, end_positions=None, diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py index cc9c1379a1e976..516e87e99bad2b 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/pytorch_transformers/modeling_xlnet.py @@ -586,10 +586,7 @@ class XLNetPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_xlnet base_model_prefix = "transformer" - def __init__(self, *inputs, **kwargs): - super(XLNetPreTrainedModel, self).__init__(*inputs, **kwargs) - - def init_weights(self, module): + def _init_weights(self, module): """ Initialize the weights. 
""" if isinstance(module, (nn.Linear, nn.Embedding)): @@ -736,7 +733,7 @@ def __init__(self, config): self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)]) self.dropout = nn.Dropout(config.dropout) - self.apply(self.init_weights) + self.init_weights() def _resize_token_embeddings(self, new_num_tokens): self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens) @@ -1037,7 +1034,7 @@ def __init__(self, config): self.transformer = XLNetModel(config) self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True) - self.apply(self.init_weights) + self.init_weights() self.tie_weights() def tie_weights(self): @@ -1114,7 +1111,7 @@ def __init__(self, config): self.sequence_summary = SequenceSummary(config) self.logits_proj = nn.Linear(config.d_model, config.num_labels) - self.apply(self.init_weights) + self.init_weights() def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, @@ -1216,7 +1213,7 @@ def __init__(self, config): self.end_logits = PoolerEndLogits(config) self.answer_class = PoolerAnswerClass(config) - self.apply(self.init_weights) + self.init_weights() def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, From 11600edc6e4e6a5ce148ca1d617c9d7e58bc7a7c Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Sat, 31 Aug 2019 00:37:41 -0400 Subject: [PATCH 076/119] Rebase on master + DistilBERT head pruning patch --- pytorch_transformers/modeling_distilbert.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pytorch_transformers/modeling_distilbert.py b/pytorch_transformers/modeling_distilbert.py index 1a0bd2496c5a34..d9a2f1a1770f03 100644 --- a/pytorch_transformers/modeling_distilbert.py +++ b/pytorch_transformers/modeling_distilbert.py @@ -174,12 +174,16 @@ def __init__(self, config): self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim) + self.pruned_heads = set() + def prune_heads(self, heads): attention_head_size = self.dim // self.n_heads if len(heads) == 0: return mask = torch.ones(self.n_heads, attention_head_size) + heads = set(heads) - self.pruned_heads for head in heads: + head -= sum(1 if h < head else 0 for h in self.pruned_heads) mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() @@ -191,6 +195,7 @@ def prune_heads(self, heads): # Update hyper params self.n_heads = self.n_heads - len(heads) self.dim = attention_head_size * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) def forward(self, query, key, value, mask, head_mask = None): """ @@ -395,7 +400,7 @@ class DistilBertPreTrainedModel(PreTrainedModel): def __init__(self, *inputs, **kwargs): super(DistilBertPreTrainedModel, self).__init__(*inputs, **kwargs) - def init_weights(self, module): + def _init_weights(self, module): """ Initialize the weights. 
""" if isinstance(module, nn.Embedding): @@ -480,7 +485,7 @@ def __init__(self, config): self.embeddings = Embeddings(config) # Embeddings self.transformer = Transformer(config) # Encoder - self.apply(self.init_weights) + self.init_weights() def _resize_token_embeddings(self, new_num_tokens): old_embeddings = self.embeddings.word_embeddings @@ -568,7 +573,7 @@ def __init__(self, config): self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) self.vocab_projector = nn.Linear(config.dim, config.vocab_size) - self.apply(self.init_weights) + self.init_weights() self.tie_weights() self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1) @@ -642,7 +647,7 @@ def __init__(self, config): self.classifier = nn.Linear(config.dim, config.num_labels) self.dropout = nn.Dropout(config.seq_classif_dropout) - self.apply(self.init_weights) + self.init_weights() def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None): distilbert_output = self.distilbert(input_ids=input_ids, @@ -716,7 +721,7 @@ def __init__(self, config): assert config.num_labels == 2 self.dropout = nn.Dropout(config.qa_dropout) - self.apply(self.init_weights) + self.init_weights() def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None, head_mask=None): distilbert_output = self.distilbert(input_ids=input_ids, From e0f867a9ba454008ecb830832bcb76c5b821e4fc Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Sat, 31 Aug 2019 00:50:59 -0400 Subject: [PATCH 077/119] XLNet bias fix on resize embeddings (cf #1124) --- pytorch_transformers/modeling_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 0d4fce67f0c3c3..4550c756203b75 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -327,6 +327,14 @@ def _tie_or_clone_weights(self, first_module, second_module): else: first_module.weight = second_module.weight + if hasattr(first_module, 'bias'): + first_module.bias.data = torch.nn.functional.pad( + first_module.bias.data, + (0, first_module.weight.shape[0] - first_module.bias.shape[0]), + 'constant', + 0 + ) + def resize_token_embeddings(self, new_num_tokens=None): """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. 
From ea86bef545f3790faa48f4e002482695641ce490 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Sat, 31 Aug 2019 00:56:22 -0400 Subject: [PATCH 078/119] Check for None --- pytorch_transformers/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 4550c756203b75..e4b67d80957adb 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -327,7 +327,7 @@ def _tie_or_clone_weights(self, first_module, second_module): else: first_module.weight = second_module.weight - if hasattr(first_module, 'bias'): + if hasattr(first_module, 'bias') and first_module.bias is not None: first_module.bias.data = torch.nn.functional.pad( first_module.bias.data, (0, first_module.weight.shape[0] - first_module.bias.shape[0]), From a1c34bd286d57625b844703705a39e40737a21ba Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Sat, 31 Aug 2019 12:21:38 +0200 Subject: [PATCH 079/119] distillation: fix ModuleNotFoundError error in token counts script --- examples/distillation/scripts/token_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/distillation/scripts/token_counts.py b/examples/distillation/scripts/token_counts.py index d6b6126fb684c6..d791c66be30628 100644 --- a/examples/distillation/scripts/token_counts.py +++ b/examples/distillation/scripts/token_counts.py @@ -19,7 +19,7 @@ import argparse import pickle -from utils import logger +from examples.distillation.utils import logger if __name__ == '__main__': parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)") From 58b59a0c316a862cc4ede3f5128fd868d7b4d5b5 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Sat, 31 Aug 2019 13:17:08 -0400 Subject: [PATCH 080/119] Random seed is accessible anywhere within the common tests --- pytorch_transformers/tests/modeling_common_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py index 8b9a2ffd17059d..aaefc582359ea8 100644 --- a/pytorch_transformers/tests/modeling_common_test.py +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -163,8 +163,9 @@ def test_headmasking(self): if not self.test_head_masking: return - torch.manual_seed(42) + global_rng.seed(42) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + global_rng.seed() config.output_attentions = True config.output_hidden_states = True @@ -174,7 +175,7 @@ def test_headmasking(self): model.eval() # Prepare head_mask - # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) + # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads) head_mask[0, 0] = 0 head_mask[-1, :-1] = 0 @@ -551,12 +552,13 @@ def run_common_tests(self): self.create_and_test_config_to_json_file() +global_rng = random.Random() def ids_tensor(shape, vocab_size, rng=None, name=None): """Creates a random int32 tensor of the shape within the vocab size.""" if rng is None: - rng = random.Random() + rng = global_rng total_dims = 1 for dim in shape: From 819b468f70b63d7c3e30840df50e1e2dca17faa4 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Sat, 31 Aug 2019 
14:40:51 -0400 Subject: [PATCH 081/119] Fixed XLM model url --- pytorch_transformers/modeling_xlm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index 035787a97b2c79..d95bd10af72baa 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -44,8 +44,8 @@ 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin", 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin", 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin", - 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.json", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.json", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin", } XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", From 6ae0bb529193670c79d8bbf35fe01df4592b7043 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Sat, 31 Aug 2019 14:46:31 -0400 Subject: [PATCH 082/119] XLM 100 different URLs --- pytorch_transformers/modeling_xlm.py | 4 ++-- pytorch_transformers/tokenization_xlm.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index d95bd10af72baa..51748df6da6113 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -45,7 +45,7 @@ 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin", 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin", 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin", } XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", @@ -57,7 +57,7 @@ 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json", } diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index 875f1760abcc7e..8155413f75f696 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -48,7 +48,7 @@ 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json", 'xlm-clm-ende-1024': 
"https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json", 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json", }, 'merges_file': { @@ -61,7 +61,7 @@ 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt", }, } From fede4ef45ddec6e7706548836b3ae2a7728fa93a Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 2 Sep 2019 02:27:39 +0200 Subject: [PATCH 083/119] fixing #1133 --- .../tests/tokenization_tests_commons.py | 7 ++++-- pytorch_transformers/tokenization_utils.py | 22 +++++++++++++++++-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py index cdc6cddf00b971..65f45c496c26aa 100644 --- a/pytorch_transformers/tests/tokenization_tests_commons.py +++ b/pytorch_transformers/tests/tokenization_tests_commons.py @@ -104,7 +104,7 @@ def test_add_tokens_tokenizer(self): self.assertNotEqual(vocab_size, 0) self.assertEqual(vocab_size, all_size) - new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"] + new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] added_toks = tokenizer.add_tokens(new_toks) vocab_size_2 = tokenizer.vocab_size all_size_2 = len(tokenizer) @@ -114,7 +114,9 @@ def test_add_tokens_tokenizer(self): self.assertEqual(added_toks, len(new_toks)) self.assertEqual(all_size_2, all_size + len(new_toks)) - tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l") + tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l") + out_string = tokenizer.decode(tokens) + self.assertGreaterEqual(len(tokens), 4) self.assertGreater(tokens[0], tokenizer.vocab_size - 1) self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) @@ -131,6 +133,7 @@ def test_add_tokens_tokenizer(self): self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l") + out_string = tokenizer.decode(tokens) self.assertGreaterEqual(len(tokens), 6) self.assertGreater(tokens[0], tokenizer.vocab_size - 1) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 3aed47eb0963ef..4b52409eead632 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -722,7 +722,7 @@ def convert_ids_to_tokens(self, ids, skip_special_tokens=False): return self._convert_id_to_token(ids) tokens = [] for index in ids: - if index in self.all_special_ids and skip_special_tokens: + if skip_special_tokens and index in self.all_special_ids: continue if index in self.added_tokens_decoder: tokens.append(self.added_tokens_decoder[index]) @@ -747,7 +747,25 @@ def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spa Similar to doing 
``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. """ filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) - text = self.convert_tokens_to_string(filtered_tokens) + + # To avoid mixing byte-level and unicode for byte-level BPT + # we need to build string separatly for added tokens and byte-level tokens + # cf. https://github.com/huggingface/pytorch-transformers/issues/1133 + sub_texts = [] + current_sub_text = [] + for token in filtered_tokens: + if skip_special_tokens and token in self.all_special_ids: + continue + if token in self.added_tokens_encoder: + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + current_sub_text = [] + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + text = ''.join(sub_texts) if self._sep_token is not None and self._sep_token in text: text = text.replace(self._cls_token, self._sep_token) From 31d3373bc92d93165bfb0788936939e23a7c5b73 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Sun, 1 Sep 2019 21:07:00 -0400 Subject: [PATCH 084/119] Appends space before special token --- pytorch_transformers/tokenization_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 4b52409eead632..53b8d245b86f87 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -760,7 +760,7 @@ def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spa if current_sub_text: sub_texts.append(self.convert_tokens_to_string(current_sub_text)) current_sub_text = [] - sub_texts.append(token) + sub_texts.append(" " + token) else: current_sub_text.append(token) if current_sub_text: From 2dcc5a16291dc959c06ed0fce8d3ddf93a99c98e Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Mon, 2 Sep 2019 12:27:11 -0400 Subject: [PATCH 085/119] [doc] Add blurb about large-scale model downloads cc @n1t0 @lysandrejik @thomwolf --- docs/source/installation.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 79d1d74a6adc6a..6512a0cef3be90 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -52,6 +52,12 @@ If you want to reproduce the original tokenization process of the ``OpenAI GPT`` If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry). +Note on model downloads (Continuous Integration or large-scale deployments) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help. + + Do you want to run a Transformer model on a mobile device? 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 3fbf301bba44e6c590e554db97de6c60a677e301 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 2 Sep 2019 12:35:14 -0400 Subject: [PATCH 086/119] [CI] Updated resource size for python 3 tests --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a35e20d146c947..48e80beaebbe18 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -4,7 +4,7 @@ jobs: working_directory: ~/pytorch-transformers docker: - image: circleci/python:3.5 - resource_class: large + resource_class: xlarge parallelism: 1 steps: - checkout From 7f522437bc878bf76454aee7c86297794f604c56 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 2 Sep 2019 13:40:25 -0400 Subject: [PATCH 087/119] Updated documentation for LM finetuning script --- docs/source/examples.rst | 6 +++++- docs/source/model_doc/distilbert.rst | 10 +++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/source/examples.rst b/docs/source/examples.rst index b444009a6d152e..d9784514382c83 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -459,7 +459,7 @@ The same option as in the original scripts are provided, please refer to the cod Causal LM fine-tuning on GPT/GPT-2, Masked LM fine-tuning on BERT/RoBERTa -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Before running the following examples you should download the `WikiText-2 dataset `__ and unpack it to some directory `$WIKITEXT_2_DATASET` The following results were obtained using the `raw` WikiText-2 (no tokens were replaced before the tokenization). @@ -467,6 +467,8 @@ The following results were obtained using the `raw` WikiText-2 (no tokens were r This example fine-tunes GPT-2 on the WikiText-2 dataset. The loss function is a causal language modeling loss (perplexity). .. code-block:: bash + + export WIKITEXT_2_DATASET=/path/to/wikitext_dataset python run_lm_finetuning.py @@ -485,6 +487,8 @@ This example fine-tunes RoBERTa on the WikiText-2 dataset. The loss function is The `--mlm` flag is necessary to fine-tune BERT/RoBERTa on masked language modeling. .. code-block:: bash + + export WIKITEXT_2_DATASET=/path/to/wikitext_dataset python run_lm_finetuning.py diff --git a/docs/source/model_doc/distilbert.rst b/docs/source/model_doc/distilbert.rst index cc156c90c21cef..141d3e151ffec9 100644 --- a/docs/source/model_doc/distilbert.rst +++ b/docs/source/model_doc/distilbert.rst @@ -2,35 +2,35 @@ DistilBERT ---------------------------------------------------- ``DistilBertConfig`` -~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: pytorch_transformers.DistilBertConfig :members: ``DistilBertTokenizer`` -~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: pytorch_transformers.DistilBertTokenizer :members: ``DistilBertModel`` -~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: pytorch_transformers.DistilBertModel :members: ``DistilBertForMaskedLM`` -~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: pytorch_transformers.DistilBertForMaskedLM :members: ``DistilBertForSequenceClassification`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
autoclass:: pytorch_transformers.DistilBertForSequenceClassification :members: From a15562e17059ce00a9a376049f2195020ecd392e Mon Sep 17 00:00:00 2001 From: Shijie Wu Date: Tue, 3 Sep 2019 18:27:29 -0700 Subject: [PATCH 088/119] Fix reference of import when called for the second time --- pytorch_transformers/tokenization_xlm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index 8155413f75f696..f7231384b31af5 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -705,6 +705,8 @@ def _tokenize(self, text, lang='en', bypass_tokenizer=False): try: if 'pythainlp' not in sys.modules: from pythainlp.tokenize import word_tokenize as th_word_tokenize + else: + th_word_tokenize = sys.modules['pythainlp'].word_tokenize except (AttributeError, ImportError) as e: logger.error("Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps") logger.error("1. pip install pythainlp") @@ -714,6 +716,8 @@ def _tokenize(self, text, lang='en', bypass_tokenizer=False): try: if 'jieba' not in sys.modules: import jieba + else: + jieba = sys.modules['jieba'] except (AttributeError, ImportError) as e: logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps") logger.error("1. pip install jieba") From 89fd3450a61b5efd76d2524df2454e0a0e4ca070 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 4 Sep 2019 13:32:18 +0200 Subject: [PATCH 089/119] Release: 1.2.0 --- pytorch_transformers/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index eb356f2165f417..3e8719bd8d7590 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -1,4 +1,4 @@ -__version__ = "1.1.0" +__version__ = "1.2.0" from .tokenization_auto import AutoTokenizer from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer from .tokenization_openai import OpenAIGPTTokenizer diff --git a/setup.py b/setup.py index 297972226817f0..c31bf6eaeb376c 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ setup( name="pytorch_transformers", - version="1.1.0", + version="1.2.0", author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors", author_email="thomas@huggingface.co", description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM", From 0b52642d379bed155e8aa4f4088588bfd8ceaa88 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Sep 2019 11:03:32 -0400 Subject: [PATCH 090/119] 1.2.0 in docs --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 79df358631eaf3..cdca1d82d0c39c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -26,7 +26,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'1.0.0' +release = u'1.2.0' # -- General configuration --------------------------------------------------- From 38b79b5a63b1cfc1b5c8b43b8462cae9abe92d3e Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 4 Sep 2019 22:36:30 +0200 Subject: [PATCH 091/119] Fixing this TransformerXL bool issue --- pytorch_transformers/modeling_transfo_xl.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git 
a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py index 0c5c5b779835ff..e3343222903dc9 100644 --- a/pytorch_transformers/modeling_transfo_xl.py +++ b/pytorch_transformers/modeling_transfo_xl.py @@ -423,7 +423,8 @@ def forward(self, h, attn_mask=None, mems=None, head_mask=None): # [qlen x klen x bsz x n_head] attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k)) attn_score.mul_(self.scale) - if attn_mask is not None and attn_mask.any().item(): + if attn_mask is not None and torch.sum(attn_mask).item(): + attn_mask = (attn_mask == 1) # Switch to bool if attn_mask.dim() == 2: attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) elif attn_mask.dim() == 3: @@ -586,7 +587,8 @@ def forward(self, w, r, attn_mask=None, mems=None, head_mask=None): attn_score.mul_(self.scale) #### compute attention probability - if attn_mask is not None and attn_mask.any().item(): + if attn_mask is not None and torch.sum(attn_mask).item(): + attn_mask = (attn_mask == 1) # Switch to bool if attn_mask.dim() == 2: attn_score = attn_score.float().masked_fill( attn_mask[None,:,:,None], -1e30).type_as(attn_score) @@ -680,7 +682,8 @@ def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None, head_ma attn_score.mul_(self.scale) #### compute attention probability - if attn_mask is not None and attn_mask.any().item(): + if attn_mask is not None and torch.sum(attn_mask).item(): + attn_mask = (attn_mask == 1) # Switch to bool if attn_mask.dim() == 2: attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) elif attn_mask.dim() == 3: @@ -1139,10 +1142,10 @@ def _forward(self, dec_inp, mems=None, head_mask=None): else: mask_shift_len = qlen dec_attn_mask = (torch.triu(all_ones, 1+mlen) - + torch.tril(all_ones, -mask_shift_len)).bool()[:, :, None] # -1 + + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 else: dec_attn_mask = torch.triu( - word_emb.new_ones(qlen, klen), diagonal=1+mlen).bool()[:,:,None] + word_emb.new_ones(qlen, klen), diagonal=1+mlen)[:,:,None] hids = [] attentions = [] From e25cba78cff6502024983ae62d129e0d09711601 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 4 Sep 2019 12:43:18 +0200 Subject: [PATCH 092/119] WIP reodering arguments for torchscript and TF --- pytorch_transformers/modeling_bert.py | 151 +++++++++--------- pytorch_transformers/modeling_distilbert.py | 18 +-- pytorch_transformers/modeling_gpt2.py | 109 ++++++++----- pytorch_transformers/modeling_openai.py | 83 ++++++---- pytorch_transformers/modeling_roberta.py | 47 ++++-- pytorch_transformers/modeling_transfo_xl.py | 2 +- pytorch_transformers/modeling_xlm.py | 73 +++++---- pytorch_transformers/modeling_xlnet.py | 73 +++++---- .../tests/modeling_bert_test.py | 24 +-- .../tests/modeling_distilbert_test.py | 4 +- 10 files changed, 331 insertions(+), 253 deletions(-) diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index e2d83460713504..5c71bedba9d779 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -596,18 +596,18 @@ def _init_weights(self, module): Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Indices of positions of each input sequence tokens in the position embeddings. 
- Selected in the range ``[0, config.max_position_embeddings - 1]``. + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: @@ -668,7 +668,7 @@ def _prune_heads(self, heads_to_prune): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None): + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None): if attention_mask is None: attention_mask = torch.ones_like(input_ids) if token_type_ids is None: @@ -771,10 +771,14 @@ def tie_weights(self): self._tie_or_clone_weights(self.cls.predictions.decoder, self.bert.embeddings.word_embeddings) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, - next_sentence_label=None, position_ids=None, head_mask=None): - outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - attention_mask=attention_mask, head_mask=head_mask) + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + masked_lm_labels=None, next_sentence_label=None): + + outputs = self.bert(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) @@ -839,10 +843,14 @@ def tie_weights(self): self._tie_or_clone_weights(self.cls.predictions.decoder, self.bert.embeddings.word_embeddings) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, - position_ids=None, head_mask=None): - outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - attention_mask=attention_mask, head_mask=head_mask) + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + masked_lm_labels=None): + + outputs = self.bert(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) sequence_output = 
outputs[0] prediction_scores = self.cls(sequence_output) @@ -896,10 +904,15 @@ def __init__(self, config): self.init_weights() - def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, - position_ids=None, head_mask=None): - outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - attention_mask=attention_mask, head_mask=head_mask) + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + next_sentence_label=None): + + outputs = self.bert(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + pooled_output = outputs[1] seq_relationship_score = self.cls(pooled_output) @@ -957,10 +970,15 @@ def __init__(self, config): self.init_weights() - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, - position_ids=None, head_mask=None): - outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - attention_mask=attention_mask, head_mask=head_mask) + def forward(self, input_ids, attention_mask=None, token_type_ids=None, + position_ids=None, head_mask=None, labels=None): + + outputs = self.bert(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) @@ -983,45 +1001,9 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=No @add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - BERT_START_DOCSTRING) + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertForMultipleChoice(BertPreTrainedModel): r""" - Inputs: - **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Indices of input sequence tokens in the vocabulary. - The second dimension of the input (`num_choices`) indicates the number of choices to score. - To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows: - - (a) For sequence pairs: - - ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` - - ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` - - (b) For single sequences: - - ``tokens: [CLS] the dog is hairy . [SEP]`` - - ``token_type_ids: 0 0 0 0 0 0 0`` - - Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. - See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and - :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Segment token indices to indicate first and second portions of the inputs. - The second dimension of the input (`num_choices`) indicates the number of choices to score. - Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` - corresponds to a `sentence B` token - (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Mask to avoid performing attention on padding token indices. - The second dimension of the input (`num_choices`) indicates the number of choices to score. 
- Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: - Mask to nullify selected heads of the self-attention modules. - Mask values selected in ``[0, 1]``: - ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension @@ -1061,16 +1043,21 @@ def __init__(self, config): self.init_weights() - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, - position_ids=None, head_mask=None): + def forward(self, input_ids, attention_mask=None, token_type_ids=None, + position_ids=None, head_mask=None, labels=None): num_choices = input_ids.shape[1] - flat_input_ids = input_ids.view(-1, input_ids.size(-1)) - flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None - flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None - flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - outputs = self.bert(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, - attention_mask=flat_attention_mask, head_mask=head_mask) + input_ids = input_ids.view(-1, input_ids.size(-1)) + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + + outputs = self.bert(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) @@ -1129,10 +1116,15 @@ def __init__(self, config): self.init_weights() - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, - position_ids=None, head_mask=None): - outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - attention_mask=attention_mask, head_mask=head_mask) + def forward(self, input_ids, attention_mask=None, token_type_ids=None, + position_ids=None, head_mask=None, labels=None): + + outputs = self.bert(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) @@ -1203,10 +1195,15 @@ def __init__(self, config): self.init_weights() - def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, - end_positions=None, position_ids=None, head_mask=None): - outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - attention_mask=attention_mask, head_mask=head_mask) + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + start_positions=None, end_positions=None): + + outputs = self.bert(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + sequence_output = outputs[0] logits = 
self.qa_outputs(sequence_output) diff --git a/pytorch_transformers/modeling_distilbert.py b/pytorch_transformers/modeling_distilbert.py index d9a2f1a1770f03..280270381cb79d 100644 --- a/pytorch_transformers/modeling_distilbert.py +++ b/pytorch_transformers/modeling_distilbert.py @@ -585,10 +585,10 @@ def tie_weights(self): self._tie_or_clone_weights(self.vocab_projector, self.distilbert.embeddings.word_embeddings) - def forward(self, input_ids, attention_mask=None, masked_lm_labels=None, head_mask=None): + def forward(self, input_ids, attention_mask=None, head_mask=None, masked_lm_labels=None): dlbrt_output = self.distilbert(input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask) + attention_mask=attention_mask, + head_mask=head_mask) hidden_states = dlbrt_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) @@ -649,10 +649,10 @@ def __init__(self, config): self.init_weights() - def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None): + def forward(self, input_ids, attention_mask=None, head_mask=None, labels=None): distilbert_output = self.distilbert(input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask) + attention_mask=attention_mask, + head_mask=head_mask) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) @@ -723,10 +723,10 @@ def __init__(self, config): self.init_weights() - def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None, head_mask=None): + def forward(self, input_ids, attention_mask=None, head_mask=None, start_positions=None, end_positions=None): distilbert_output = self.distilbert(input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask) + attention_mask=attention_mask, + head_mask=head_mask) hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 017ad4f7b47819..38f67338fcca01 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -257,7 +257,7 @@ def prune_heads(self, heads): self.n_head = self.n_head - len(heads) self.pruned_heads = self.pruned_heads.union(heads) - def _attn(self, q, k, v, head_mask=None): + def _attn(self, q, k, v, attention_mask=None, head_mask=None): w = torch.matmul(q, k) if self.scale: w = w / math.sqrt(v.size(-1)) @@ -265,6 +265,10 @@ def _attn(self, q, k, v, head_mask=None): b = self.bias[:, :, ns-nd:ns, :ns] w = w * b - 1e4 * (1 - b) + if attention_mask is not None: + # Apply the attention mask + w = w + attention_mask + w = nn.Softmax(dim=-1)(w) w = self.attn_dropout(w) @@ -290,7 +294,7 @@ def split_heads(self, x, k=False): else: return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) - def forward(self, x, layer_past=None, head_mask=None): + def forward(self, x, layer_past=None, attention_mask=None, head_mask=None): x = self.c_attn(x) query, key, value = x.split(self.split_size, dim=2) query = self.split_heads(query) @@ -302,7 +306,7 @@ def forward(self, x, layer_past=None, head_mask=None): value = torch.cat((past_value, value), dim=-2) present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking - attn_outputs = 
self._attn(query, key, value, head_mask) + attn_outputs = self._attn(query, key, value, attention_mask, head_mask) a = attn_outputs[0] a = self.merge_heads(a) @@ -337,8 +341,11 @@ def __init__(self, n_ctx, config, scale=False): self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) - def forward(self, x, layer_past=None, head_mask=None): - output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask) + def forward(self, x, layer_past=None, attention_mask=None, head_mask=None): + output_attn = self.attn(self.ln_1(x), + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask) a = output_attn[0] # output_attn: a, present, (attentions) x = x + a @@ -404,17 +411,21 @@ def _init_weights(self, module): Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. - **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - A parallel sequence of tokens (can be used to indicate various portions of the inputs). - The embeddings from these tokens will be summed with the respective token embeddings. - Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). **past**: list of ``torch.FloatTensor`` (one for each layer): that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). + The embeddings from these tokens will be summed with the respective token embeddings. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. 
Mask values selected in ``[0, 1]``: @@ -473,7 +484,7 @@ def _prune_heads(self, heads_to_prune): for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) - def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None): + def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None): if past is None: past_length = 0 past = [None] * len(self.h) @@ -483,6 +494,23 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + # Attention mask. + if attention_mask is not None: + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * -10000.0 + # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N @@ -520,7 +548,11 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) - outputs = block(hidden_states, layer_past, head_mask[i]) + outputs = block(hidden_states, + past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask[i]) + hidden_states, present = outputs[:2] presents = presents + (present,) @@ -601,9 +633,14 @@ def tie_weights(self): self._tie_or_clone_weights(self.lm_head, self.transformer.wte) - def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None): - transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - past=past, head_mask=head_mask) + def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + labels=None): + transformer_outputs = self.transformer(input_ids, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) @@ -626,33 +663,12 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). 
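With the hunks above, GPT-2 now accepts a padding ``attention_mask`` and a keyword-first signature. An illustrative usage sketch, not taken from the patch itself (it assumes the ``gpt2`` shortcut name and the keyword names introduced in the reordered ``forward``)::

    # Sketch only: exercises the reordered GPT-2 keywords shown above.
    import torch
    from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.eval()

    input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute")])  # batch size 1
    attention_mask = torch.ones_like(input_ids)  # 1 = attend, 0 = padding
    outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
    loss, lm_logits = outputs[:2]

Because the positional order of the arguments changes in this series, calling the models with explicit keywords as above is the safe pattern for downstream code.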
-""", GPT2_START_DOCSTRING) +""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) class GPT2DoubleHeadsModel(GPT2PreTrainedModel): - r""" Inputs: - **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Indices of input sequence tokens in the vocabulary. - The second dimension of the input (`num_choices`) indicates the number of choices to score. - Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. - See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and - :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + r""" **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. - **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. - **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - A parallel sequence of tokens (can be used to indicate various portions of the inputs). - The embeddings from these tokens will be summed with the respective token embeddings. - Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). - **past**: - list of ``torch.FloatTensor`` (one for each layer): - that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model - (see `past` output below). Can be used to speed up sequential decoding. - **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: - Mask to nullify selected heads of the self-attention modules. - Mask values selected in ``[0, 1]``: - ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set ``lm_labels = input_ids`` @@ -725,10 +741,15 @@ def tie_weights(self): self._tie_or_clone_weights(self.lm_head, self.transformer.wte) - def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None, - position_ids=None, past=None, head_mask=None): - transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - past=past, head_mask=head_mask) + def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + mc_token_ids=None, lm_labels=None, mc_labels=None): + transformer_outputs = self.transformer(input_ids, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index 8bf9d86696c154..55f0a4e8d2bab5 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -270,7 +270,7 @@ def prune_heads(self, heads): self.n_head = self.n_head - len(heads) self.pruned_heads = self.pruned_heads.union(heads) - def _attn(self, q, k, v, head_mask=None): + def _attn(self, q, k, v, attention_mask=None, head_mask=None): w = torch.matmul(q, k) if self.scale: w = w / math.sqrt(v.size(-1)) @@ -279,6 +279,10 @@ def _attn(self, q, k, v, head_mask=None): b = self.bias[:, :, : w.size(-2), : w.size(-1)] w = w * b + -1e9 * (1 - b) + if attention_mask is not None: + # Apply the attention mask + w = w + attention_mask + w = nn.Softmax(dim=-1)(w) w = self.attn_dropout(w) @@ -304,14 +308,14 @@ def split_heads(self, x, k=False): else: return x.permute(0, 2, 1, 3) - def forward(self, x, head_mask=None): + def forward(self, x, attention_mask=None, head_mask=None): x = self.c_attn(x) query, key, value = x.split(self.split_size, dim=2) query = self.split_heads(query) key = self.split_heads(key, k=True) value = self.split_heads(value) - attn_outputs = self._attn(query, key, value, head_mask) + attn_outputs = self._attn(query, key, value, attention_mask, head_mask) a = attn_outputs[0] a = self.merge_heads(a) @@ -346,8 +350,8 @@ def __init__(self, n_ctx, config, scale=False): self.mlp = MLP(4 * nx, config) self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) - def forward(self, x, head_mask=None): - attn_outputs = self.attn(x, head_mask=head_mask) + def forward(self, x, attention_mask=None, head_mask=None): + attn_outputs = self.attn(x, attention_mask=attention_mask, head_mask=head_mask) a = attn_outputs[0] n = self.ln_1(x + a) @@ -410,13 +414,17 @@ def _init_weights(self, module): Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. 
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: A parallel sequence of tokens (can be used to indicate various portions of the inputs). The embeddings from these tokens will be summed with the respective token embeddings. Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices) + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: @@ -470,7 +478,7 @@ def _prune_heads(self, heads_to_prune): for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) - def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None): + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None): if position_ids is None: # This was used when we had a single embedding matrice from position and token embeddings # start = self.config.vocab_size + self.config.n_special @@ -479,6 +487,23 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=N position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + # Attention mask. + if attention_mask is not None: + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
+ attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * -10000.0 + # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N @@ -515,7 +540,7 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=N if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) - outputs = block(hidden_states, head_mask[i]) + outputs = block(hidden_states, attention_mask, head_mask[i]) hidden_states = outputs[0] if self.output_attentions: all_attentions = all_attentions + (outputs[1],) @@ -580,8 +605,12 @@ def tie_weights(self): self._tie_or_clone_weights(self.lm_head, self.transformer.tokens_embed) - def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, head_mask=None): - transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + labels=None): + transformer_outputs = self.transformer(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, head_mask=head_mask) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) @@ -604,29 +633,12 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). -""", OPENAI_GPT_START_DOCSTRING) +""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): - r""" Inputs: - **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Indices of input sequence tokens in the vocabulary. - The second dimension of the input (`num_choices`) indicates the number of choices to score. - Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. - See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and - :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + r""" **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. - **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. - **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - A parallel sequence of tokens (can be used to indicate various portions of the inputs). - The embeddings from these tokens will be summed with the respective token embeddings. - Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). - **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: - Mask to nullify selected heads of the self-attention modules. 
- Mask values selected in ``[0, 1]``: - ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` @@ -687,9 +699,12 @@ def tie_weights(self): self._tie_or_clone_weights(self.lm_head, self.transformer.tokens_embed) - def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None, - position_ids=None, head_mask=None): - transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + lm_labels=None, mc_labels=None): + transformer_outputs = self.transformer(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, head_mask=head_mask) hidden_states = transformer_outputs[0] diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py index 6ae5cd1d44048f..0694e76415813e 100644 --- a/pytorch_transformers/modeling_roberta.py +++ b/pytorch_transformers/modeling_roberta.py @@ -61,7 +61,9 @@ def forward(self, input_ids, token_type_ids=None, position_ids=None): # cf. fairseq's `utils.make_positions` position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) - return super(RobertaEmbeddings, self).forward(input_ids, token_type_ids=token_type_ids, position_ids=position_ids) + return super(RobertaEmbeddings, self).forward(input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids) class RobertaConfig(BertConfig): @@ -116,13 +118,20 @@ class RobertaConfig(BertConfig): See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1[``. **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Optional segment token indices to indicate first and second portions of the inputs. + This embedding matrice is not trained (not pretrained during RoBERTa pretraining), you will have to train it + during finetuning. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1[``. 
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: @@ -170,12 +179,16 @@ def __init__(self, config): self.embeddings = RobertaEmbeddings(config) self.init_weights() - def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None): + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None): if input_ids[:, 0].sum().item() != 0: logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. " "This model requires special tokens in order to work. " "Please specify add_special_tokens=True in your encoding.") - return super(RobertaModel, self).forward(input_ids, token_type_ids, attention_mask, position_ids, head_mask) + return super(RobertaModel, self).forward(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, @@ -229,10 +242,13 @@ def tie_weights(self): """ self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, position_ids=None, - head_mask=None): - outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - attention_mask=attention_mask, head_mask=head_mask) + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + masked_lm_labels=None): + outputs = self.roberta(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) @@ -313,10 +329,13 @@ def __init__(self, config): self.roberta = RobertaModel(config) self.classifier = RobertaClassificationHead(config) - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, - position_ids=None, head_mask=None): - outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - attention_mask=attention_mask, head_mask=head_mask) + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + labels=None): + outputs = self.roberta(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) sequence_output = outputs[0] logits = self.classifier(sequence_output) diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py index e3343222903dc9..63303dae5e91b6 100644 --- a/pytorch_transformers/modeling_transfo_xl.py +++ b/pytorch_transformers/modeling_transfo_xl.py @@ -1342,7 +1342,7 @@ def reset_length(self, tgt_len, ext_len, mem_len): def init_mems(self, data): return self.transformer.init_mems(data) - def forward(self, input_ids, labels=None, mems=None, head_mask=None): + def forward(self, input_ids, mems=None, head_mask=None, labels=None): bsz = input_ids.size(0) tgt_len = input_ids.size(1) diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index 847a3ecfb7d75d..7e13f104950822 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -441,23 +441,23 @@ def _init_weights(self, module): 
Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`. See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. - **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - A parallel sequence of tokens (can be used to indicate various portions of the inputs). - The embeddings from these tokens will be summed with the respective token embeddings. - Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. **langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are languages ids which can be obtained from the language names by using two conversion mappings provided in the configuration of the model (only provided for multilingual models). More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str). - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). + The embeddings from these tokens will be summed with the respective token embeddings. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. **lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Length of each sentence that can be used to avoid performing attention on padding token indices. You can also use `attention_mask` for the same result (see above), kept here for compatbility. 
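The same keyword-first calling convention applies to XLM once its inputs are reordered in the hunks that follow. A short sketch under those assumptions (it uses the ``xlm-mlm-en-2048`` shortcut name; as noted above, ``lengths`` remains an alternative to ``attention_mask``)::

    # Sketch only: keyword names follow the reordered XLM signature below.
    import torch
    from pytorch_transformers import XLMTokenizer, XLMModel

    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
    model = XLMModel.from_pretrained('xlm-mlm-en-2048')
    model.eval()

    input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute")])
    attention_mask = torch.ones_like(input_ids)
    # equivalently: lengths = attention_mask.sum(dim=1)
    outputs = model(input_ids, attention_mask=attention_mask)
    last_hidden_state = outputs[0]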
@@ -584,8 +584,8 @@ def _prune_heads(self, heads_to_prune): for layer, heads in heads_to_prune.items(): self.attentions[layer].prune_heads(heads) - def forward(self, input_ids, lengths=None, position_ids=None, langs=None, - token_type_ids=None, attention_mask=None, cache=None, head_mask=None): # src_enc=None, src_len=None, + def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, + lengths=None, cache=None, head_mask=None): # removed: src_enc=None, src_len=None if lengths is None: lengths = (input_ids != self.pad_index).sum(dim=1).long() # mask = input_ids != self.pad_index @@ -790,11 +790,16 @@ def tie_weights(self): """ self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings) - def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None, - attention_mask=None, cache=None, labels=None, head_mask=None): - transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, - token_type_ids=token_type_ids, langs=langs, - attention_mask=attention_mask, cache=cache, head_mask=head_mask) + def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, + lengths=None, cache=None, head_mask=None, labels=None): + transformer_outputs = self.transformer(input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask) output = transformer_outputs[0] outputs = self.pred_layer(output, labels) @@ -846,11 +851,16 @@ def __init__(self, config): self.init_weights() - def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None, - attention_mask=None, cache=None, labels=None, head_mask=None): - transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, - token_type_ids=token_type_ids, langs=langs, - attention_mask=attention_mask, cache=cache, head_mask=head_mask) + def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, + lengths=None, cache=None, head_mask=None, labels=None): + transformer_outputs = self.transformer(input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask) output = transformer_outputs[0] logits = self.sequence_summary(output) @@ -924,12 +934,17 @@ def __init__(self, config): self.init_weights() - def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None, - attention_mask=None, cache=None, start_positions=None, end_positions=None, - cls_index=None, is_impossible=None, p_mask=None, head_mask=None): - transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, - token_type_ids=token_type_ids, langs=langs, - attention_mask=attention_mask, cache=cache, head_mask=head_mask) + def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, + lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None, + is_impossible=None, cls_index=None, p_mask=None): + transformer_outputs = self.transformer(input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask) output = transformer_outputs[0] diff --git a/pytorch_transformers/modeling_xlnet.py 
b/pytorch_transformers/modeling_xlnet.py index 516e87e99bad2b..0dfb33a27abdb2 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/pytorch_transformers/modeling_xlnet.py @@ -647,21 +647,10 @@ def _init_weights(self, module): Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`. See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - A parallel sequence of tokens (can be used to indicate various portions of the inputs). - The embeddings from these tokens will be summed with the respective token embeddings. - Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - **input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: - Mask to avoid performing attention on padding token indices. - Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding. - Kept for compatibility with the original code base. - You can only uses one of `input_mask` and `attention_mask` - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED. **mems**: (`optional`) list of ``torch.FloatTensor`` (one for each layer): that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model @@ -679,6 +668,17 @@ def _init_weights(self, module): Mask to indicate the output tokens to use. If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token. Only used during pretraining for partial prediction or for sequential decoding (generation). + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). + The embeddings from these tokens will be summed with the respective token embeddings. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). + **input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding. + Kept for compatibility with the original code base. + You can only uses one of `input_mask` and `attention_mask` + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED. **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. 
Mask values selected in ``[0, 1]``: @@ -837,8 +837,8 @@ def relative_positional_encoding(self, qlen, klen, bsz=None): pos_emb = pos_emb.to(next(self.parameters())) return pos_emb - def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, - mems=None, perm_mask=None, target_mapping=None, head_mask=None): + def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, + token_type_ids=None, input_mask=None, head_mask=None): # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # but we want a unified interface in the library with the batch size on the first dimension # so we move here the first dimension (batch) to the end @@ -1042,12 +1042,15 @@ def tie_weights(self): """ self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding) - def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, - mems=None, perm_mask=None, target_mapping=None, - labels=None, head_mask=None): - transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids, - input_mask=input_mask, attention_mask=attention_mask, - mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, + def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, + token_type_ids=None, input_mask=None, head_mask=None, labels=None): + transformer_outputs = self.transformer(input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, head_mask=head_mask) logits = self.lm_loss(transformer_outputs[0]) @@ -1113,12 +1116,15 @@ def __init__(self, config): self.init_weights() - def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, - mems=None, perm_mask=None, target_mapping=None, - labels=None, head_mask=None): - transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids, - input_mask=input_mask, attention_mask=attention_mask, - mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, + def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, + token_type_ids=None, input_mask=None, head_mask=None, labels=None): + transformer_outputs = self.transformer(input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, head_mask=head_mask) output = transformer_outputs[0] @@ -1215,13 +1221,16 @@ def __init__(self, config): self.init_weights() - def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, - mems=None, perm_mask=None, target_mapping=None, - start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None, - head_mask=None): - transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids, - input_mask=input_mask, attention_mask=attention_mask, - mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, + def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, + token_type_ids=None, input_mask=None, head_mask=None, + start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,): + transformer_outputs = self.transformer(input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, 
head_mask=head_mask) hidden_states = transformer_outputs[0] start_logits = self.start_logits(hidden_states, p_mask=p_mask) diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py index ac5d2636a9cf39..e1cce38479a975 100644 --- a/pytorch_transformers/tests/modeling_bert_test.py +++ b/pytorch_transformers/tests/modeling_bert_test.py @@ -126,8 +126,8 @@ def check_loss_output(self, result): def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertModel(config=config) model.eval() - sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask) - sequence_output, pooled_output = model(input_ids, token_type_ids) + sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids) sequence_output, pooled_output = model(input_ids) result = { @@ -143,7 +143,7 @@ def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_m def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForMaskedLM(config=config) model.eval() - loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels) + loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels) result = { "loss": loss, "prediction_scores": prediction_scores, @@ -156,7 +156,7 @@ def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForNextSentencePrediction(config=config) model.eval() - loss, seq_relationship_score = model(input_ids, token_type_ids, input_mask, sequence_labels) + loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels) result = { "loss": loss, "seq_relationship_score": seq_relationship_score, @@ -170,7 +170,8 @@ def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForPreTraining(config=config) model.eval() - loss, prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels) + loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, + masked_lm_labels=token_labels, next_sentence_label=sequence_labels) result = { "loss": loss, "prediction_scores": prediction_scores, @@ -188,7 +189,8 @@ def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_id def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertForQuestionAnswering(config=config) model.eval() - loss, start_logits, end_logits = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels) + loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, + start_positions=sequence_labels, end_positions=sequence_labels) result = { "loss": 
loss, "start_logits": start_logits, @@ -207,7 +209,7 @@ def create_and_check_bert_for_sequence_classification(self, config, input_ids, t config.num_labels = self.num_labels model = BertForSequenceClassification(config) model.eval() - loss, logits = model(input_ids, token_type_ids, input_mask, sequence_labels) + loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) result = { "loss": loss, "logits": logits, @@ -222,7 +224,7 @@ def create_and_check_bert_for_token_classification(self, config, input_ids, toke config.num_labels = self.num_labels model = BertForTokenClassification(config=config) model.eval() - loss, logits = model(input_ids, token_type_ids, input_mask, token_labels) + loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) result = { "loss": loss, "logits": logits, @@ -241,9 +243,9 @@ def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_typ multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() loss, logits = model(multiple_choice_inputs_ids, - multiple_choice_token_type_ids, - multiple_choice_input_mask, - choice_labels) + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels) result = { "loss": loss, "logits": logits, diff --git a/pytorch_transformers/tests/modeling_distilbert_test.py b/pytorch_transformers/tests/modeling_distilbert_test.py index 1c9d9c792dd6e5..7b0b6b8266cbe3 100644 --- a/pytorch_transformers/tests/modeling_distilbert_test.py +++ b/pytorch_transformers/tests/modeling_distilbert_test.py @@ -148,7 +148,7 @@ def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mas def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): model = DistilBertForQuestionAnswering(config=config) model.eval() - loss, start_logits, end_logits = model(input_ids, input_mask, sequence_labels, sequence_labels) + loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels) result = { "loss": loss, "start_logits": start_logits, @@ -166,7 +166,7 @@ def create_and_check_distilbert_for_sequence_classification(self, config, input_ config.num_labels = self.num_labels model = DistilBertForSequenceClassification(config) model.eval() - loss, logits = model(input_ids, input_mask, sequence_labels) + loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels) result = { "loss": loss, "logits": logits, From 7fba47b7d98d918c6dac5dcc50123ebb8eec9685 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 4 Sep 2019 22:29:17 +0200 Subject: [PATCH 093/119] WIP reordering --- pytorch_transformers/modeling_gpt2.py | 4 ++-- pytorch_transformers/modeling_openai.py | 6 +++--- pytorch_transformers/tests/modeling_roberta_test.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 38f67338fcca01..d16448beaac09f 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -549,7 +549,7 @@ def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None all_hidden_states = all_hidden_states + 
(hidden_states.view(*output_shape),) outputs = block(hidden_states, - past=layer_past, + layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i]) @@ -666,7 +666,7 @@ def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) class GPT2DoubleHeadsModel(GPT2PreTrainedModel): r""" - **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: + **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index 55f0a4e8d2bab5..4fbec7a768bb85 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -636,7 +636,7 @@ def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): r""" - **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: + **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -678,7 +678,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, mc_token_ids) + outputs = model(input_ids, mc_token_ids=mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] """ @@ -700,7 +700,7 @@ def tie_weights(self): self.transformer.tokens_embed) def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, - lm_labels=None, mc_labels=None): + mc_token_ids=None, lm_labels=None, mc_labels=None): transformer_outputs = self.transformer(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, diff --git a/pytorch_transformers/tests/modeling_roberta_test.py b/pytorch_transformers/tests/modeling_roberta_test.py index 94035e9667bca5..0279f3756bfdef 100644 --- a/pytorch_transformers/tests/modeling_roberta_test.py +++ b/pytorch_transformers/tests/modeling_roberta_test.py @@ -123,8 +123,8 @@ def create_and_check_roberta_model(self, config, input_ids, token_type_ids, inpu token_labels, choice_labels): model = RobertaModel(config=config) model.eval() - sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask) - sequence_output, pooled_output = model(input_ids, token_type_ids) + sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids) sequence_output, pooled_output = model(input_ids) result = { @@ -140,7 +140,7 @@ def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_i 
token_labels, choice_labels): model = RobertaForMaskedLM(config=config) model.eval() - loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels) + loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels) result = { "loss": loss, "prediction_scores": prediction_scores, From 0be6a2a62452648b8d8b5202a7ec3cb87063d894 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 4 Sep 2019 22:47:38 +0200 Subject: [PATCH 094/119] be sure we have uint8 --- pytorch_transformers/modeling_transfo_xl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py index e3343222903dc9..9e2808f218f27c 100644 --- a/pytorch_transformers/modeling_transfo_xl.py +++ b/pytorch_transformers/modeling_transfo_xl.py @@ -1135,7 +1135,7 @@ def _forward(self, dec_inp, mems=None, head_mask=None): mlen = mems[0].size(0) if mems is not None else 0 klen = mlen + qlen if self.same_length: - all_ones = word_emb.new_ones(qlen, klen) + all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8) mask_len = klen - self.mem_len if mask_len > 0: mask_shift_len = qlen - mask_len @@ -1145,7 +1145,7 @@ def _forward(self, dec_inp, mems=None, head_mask=None): + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 else: dec_attn_mask = torch.triu( - word_emb.new_ones(qlen, klen), diagonal=1+mlen)[:,:,None] + word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None] hids = [] attentions = [] From 2a667b1eb979a406642eb03fd234211b0bf0aa41 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 5 Sep 2019 00:26:57 +0200 Subject: [PATCH 095/119] split configuration and modeling files --- pytorch_transformers/__init__.py | 64 +++--- pytorch_transformers/configuration_auto.py | 135 ++++++++++++ pytorch_transformers/configuration_bert.py | 113 ++++++++++ .../configuration_distilbert.py | 89 ++++++++ pytorch_transformers/configuration_gpt2.py | 143 ++++++++++++ pytorch_transformers/configuration_openai.py | 135 ++++++++++++ pytorch_transformers/configuration_roberta.py | 35 +++ .../configuration_transfo_xl.py | 167 ++++++++++++++ pytorch_transformers/configuration_utils.py | 207 ++++++++++++++++++ pytorch_transformers/configuration_xlm.py | 184 ++++++++++++++++ pytorch_transformers/configuration_xlnet.py | 172 +++++++++++++++ pytorch_transformers/file_utils.py | 24 ++ pytorch_transformers/modeling_auto.py | 125 +---------- pytorch_transformers/modeling_bert.py | 93 +------- pytorch_transformers/modeling_distilbert.py | 67 +----- pytorch_transformers/modeling_gpt2.py | 132 +---------- pytorch_transformers/modeling_openai.py | 119 +--------- pytorch_transformers/modeling_roberta.py | 20 +- pytorch_transformers/modeling_transfo_xl.py | 151 +------------ pytorch_transformers/modeling_utils.py | 205 +---------------- pytorch_transformers/modeling_xlm.py | 166 +------------- pytorch_transformers/modeling_xlnet.py | 151 +------------ .../tests/configuration_common_test.py | 63 ++++++ .../tests/modeling_auto_test.py | 3 +- .../tests/modeling_bert_test.py | 3 +- .../tests/modeling_common_test.py | 6 +- .../tests/modeling_distilbert_test.py | 4 +- .../tests/modeling_gpt2_test.py | 3 +- .../tests/modeling_openai_test.py | 3 +- .../tests/modeling_roberta_test.py | 3 +- .../tests/modeling_transfo_xl_test.py | 3 +- .../tests/modeling_xlm_test.py | 3 +- .../tests/modeling_xlnet_test.py | 3 +- 33 files changed, 1571 insertions(+), 1223 deletions(-) 
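The reorganisation below keeps the public import surface flat: configuration classes move into dedicated ``configuration_*`` modules but are still re-exported from the package root (see the ``__init__.py`` hunk further down). A hedged sketch of the intended downstream usage, not part of the patch::

    # Sketch only: relies on the re-exports listed in the new __init__.py below.
    from pytorch_transformers import BertConfig, BertModel, CONFIG_NAME, WEIGHTS_NAME

    config = BertConfig.from_pretrained('bert-base-uncased')
    config.output_hidden_states = True  # configs stay mutable after loading
    model = BertModel(config)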
create mode 100644 pytorch_transformers/configuration_auto.py create mode 100644 pytorch_transformers/configuration_bert.py create mode 100644 pytorch_transformers/configuration_distilbert.py create mode 100644 pytorch_transformers/configuration_gpt2.py create mode 100644 pytorch_transformers/configuration_openai.py create mode 100644 pytorch_transformers/configuration_roberta.py create mode 100644 pytorch_transformers/configuration_transfo_xl.py create mode 100644 pytorch_transformers/configuration_utils.py create mode 100644 pytorch_transformers/configuration_xlm.py create mode 100644 pytorch_transformers/configuration_xlnet.py create mode 100644 pytorch_transformers/tests/configuration_common_test.py diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index 3e8719bd8d7590..04a73c3abca7bd 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -1,4 +1,7 @@ __version__ = "1.2.0" + +# Tokenizer +from .tokenization_utils import (PreTrainedTokenizer) from .tokenization_auto import AutoTokenizer from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer from .tokenization_openai import OpenAIGPTTokenizer @@ -9,46 +12,51 @@ from .tokenization_roberta import RobertaTokenizer from .tokenization_distilbert import DistilBertTokenizer -from .tokenization_utils import (PreTrainedTokenizer) +# Configurations +from .configuration_utils import CONFIG_NAME, PretrainedConfig +from .configuration_auto import AutoConfig +from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .modeling_auto import (AutoConfig, AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, +# Modeling +from .modeling_utils import (WEIGHTS_NAME, TF_WEIGHTS_NAME, PreTrainedModel, prune_layer, Conv1D) +from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelWithLMHead) -from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining, +from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction, BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification, BertForQuestionAnswering, - load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - BERT_PRETRAINED_CONFIG_ARCHIVE_MAP) -from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTPreTrainedModel, OpenAIGPTModel, + load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, - load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, - OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, - load_tf_weights_in_transfo_xl, 
TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, - TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_gpt2 import (GPT2Config, GPT2PreTrainedModel, GPT2Model, + load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, + load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, - load_tf_weights_in_gpt2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, - GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_xlnet import (XLNetConfig, - XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, + load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, - load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel, + load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_xlm import (XLMPreTrainedModel , XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, - XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLM_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, - ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_distilbert import (DistilBertConfig, DistilBertForMaskedLM, DistilBertModel, + XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, - DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME, - PretrainedConfig, PreTrainedModel, prune_layer, Conv1D) + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) +# Optimization from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) -from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path) +# Files and general utilities +from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path, add_start_docstrings, add_end_docstrings) diff --git a/pytorch_transformers/configuration_auto.py b/pytorch_transformers/configuration_auto.py new file mode 100644 index 00000000000000..9e35f85dc74808 --- /dev/null +++ b/pytorch_transformers/configuration_auto.py @@ -0,0 +1,135 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
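After this split the configuration classes live in the new ``configuration_*.py`` modules but are still re-exported from the package root, so existing imports keep working; an illustrative sketch, assuming the package is installed from this branch::

    from pytorch_transformers import AutoConfig, BertConfig, BertModel

    config = BertConfig(num_hidden_layers=6)   # construct a configuration directly
    model = BertModel(config)                  # randomly initialised model built from that config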
+""" Auto Model class. """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging + +from .configuration_bert import BertConfig +from .configuration_openai import OpenAIGPTConfig +from .configuration_gpt2 import GPT2Config +from .configuration_transfo_xl import TransfoXLConfig +from .configuration_xlnet import XLNetConfig +from .configuration_xlm import XLMConfig +from .configuration_roberta import RobertaConfig +from .configuration_distilbert import DistilBertConfig + +logger = logging.getLogger(__name__) + + +class AutoConfig(object): + r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class + that will be instantiated as one of the configuration classes of the library + when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method take care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The base model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertConfig (DistilBERT model) + - contains `bert`: BertConfig (Bert model) + - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) + - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) + - contains `xlnet`: XLNetConfig (XLNet model) + - contains `xlm`: XLMConfig (XLM model) + - contains `roberta`: RobertaConfig (RoBERTa model) + + This class cannot be instantiated using `__init__()` (throw an error). + """ + def __init__(self): + raise EnvironmentError("AutoConfig is designed to be instantiated " + "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" Instantiate a one of the configuration classes of the library + from a pre-trained model configuration. + + The configuration class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertConfig (DistilBERT model) + - contains `bert`: BertConfig (Bert model) + - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) + - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) + - contains `xlnet`: XLNetConfig (XLNet model) + - contains `xlm`: XLMConfig (XLM model) + - contains `roberta`: RobertaConfig (RoBERTa model) + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. + + - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. 
+ - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + return_unused_kwargs: (`optional`) bool: + + - If False, then this function returns just the final configuration object. + - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. + + Examples:: + + config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') + config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) + assert config.output_attention == True + config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, + foo=False, return_unused_kwargs=True) + assert config.output_attention == True + assert unused_kwargs == {'foo': False} + + """ + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'roberta' in pretrained_model_name_or_path: + return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'openai-gpt' in pretrained_model_name_or_path: + return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'gpt2' in pretrained_model_name_or_path: + return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'transfo-xl' in pretrained_model_name_or_path: + return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta'".format(pretrained_model_name_or_path)) diff --git a/pytorch_transformers/configuration_bert.py b/pytorch_transformers/configuration_bert.py new file mode 100644 index 00000000000000..7fff3e5d058720 --- /dev/null +++ b/pytorch_transformers/configuration_bert.py @@ -0,0 +1,113 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
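The ``if/elif`` chain above checks ``distilbert`` and ``roberta`` before the generic ``bert`` branch because those identifiers contain the substring ``bert``; an illustrative sketch, assuming network access to the S3 archive maps::

    from pytorch_transformers import AutoConfig

    config = AutoConfig.from_pretrained('roberta-base')       # resolved to RobertaConfig
    config = AutoConfig.from_pretrained('bert-base-uncased')  # resolved to BertConfig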
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BERT model configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", + 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", +} + + +class BertConfig(PretrainedConfig): + r""" + :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a + `BertModel`. + + + Arguments: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. 
Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. + """ + pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + **kwargs): + super(BertConfig, self).__init__(**kwargs) + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") diff --git a/pytorch_transformers/configuration_distilbert.py b/pytorch_transformers/configuration_distilbert.py new file mode 100644 index 00000000000000..b8929eedec7633 --- /dev/null +++ b/pytorch_transformers/configuration_distilbert.py @@ -0,0 +1,89 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
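``vocab_size_or_config_json_file`` accepts either an integer vocabulary size or a path to a JSON config file; any other type raises ``ValueError``. An illustrative sketch (the JSON path below is hypothetical)::

    from pytorch_transformers import BertConfig

    config = BertConfig(vocab_size_or_config_json_file=30522, num_hidden_layers=6)
    # config = BertConfig('/path/to/bert_config.json')  # hypothetical file: every JSON key becomes an attribute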
+""" DistilBERT model configuration """ +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" +} + + +class DistilBertConfig(PretrainedConfig): + pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=30522, + max_position_embeddings=512, + sinusoidal_pos_embds=True, + n_layers=6, + n_heads=12, + dim=768, + hidden_dim=4*768, + dropout=0.1, + attention_dropout=0.1, + activation='gelu', + initializer_range=0.02, + tie_weights_=True, + qa_dropout=0.1, + seq_classif_dropout=0.2, + **kwargs): + super(DistilBertConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.max_position_embeddings = max_position_embeddings + self.sinusoidal_pos_embds = sinusoidal_pos_embds + self.n_layers = n_layers + self.n_heads = n_heads + self.dim = dim + self.hidden_dim = hidden_dim + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation = activation + self.initializer_range = initializer_range + self.tie_weights_ = tie_weights_ + self.qa_dropout = qa_dropout + self.seq_classif_dropout = seq_classif_dropout + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + @property + def hidden_size(self): + return self.dim + + @property + def num_attention_heads(self): + return self.n_heads + + @property + def num_hidden_layers(self): + return self.n_layers diff --git a/pytorch_transformers/configuration_gpt2.py b/pytorch_transformers/configuration_gpt2.py new file mode 100644 index 00000000000000..c83d9e82cef82f --- /dev/null +++ b/pytorch_transformers/configuration_gpt2.py @@ -0,0 +1,143 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" OpenAI GPT-2 configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"} + +class GPT2Config(PretrainedConfig): + """Configuration class to store the configuration of a `GPT2Model`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + layer_norm_epsilon: epsilon to use in the layer norm layers + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__( + self, + vocab_size_or_config_json_file=50257, + n_positions=1024, + n_ctx=1024, + n_embd=768, + n_layer=12, + n_head=12, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + + num_labels=1, + summary_type='cls_index', + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + **kwargs + ): + """Constructs GPT2Config. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + layer_norm_epsilon: epsilon to use in the layer norm layers + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. 
+ """ + super(GPT2Config, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + else: + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def max_position_embeddings(self): + return self.n_positions + + @property + def hidden_size(self): + return self.n_embd + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/pytorch_transformers/configuration_openai.py b/pytorch_transformers/configuration_openai.py new file mode 100644 index 00000000000000..b27df5689982ad --- /dev/null +++ b/pytorch_transformers/configuration_openai.py @@ -0,0 +1,135 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" OpenAI GPT configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" +} + +class OpenAIGPTConfig(PretrainedConfig): + """ + Configuration class to store the configuration of a `OpenAIGPTModel`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. + n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. 
+ afn: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + layer_norm_epsilon: epsilon to use in the layer norm layers + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + predict_special_tokens: should we predict special tokens (when the model has a LM head) + """ + pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__( + self, + vocab_size_or_config_json_file=40478, + n_positions=512, + n_ctx=512, + n_embd=768, + n_layer=12, + n_head=12, + afn="gelu", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + predict_special_tokens=True, + + num_labels=1, + summary_type='cls_index', + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + **kwargs + ): + """Constructs OpenAIGPTConfig. + """ + super(OpenAIGPTConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.afn = afn + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.predict_special_tokens = predict_special_tokens + + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + else: + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def max_position_embeddings(self): + return self.n_positions + + @property + def hidden_size(self): + return self.n_embd + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/pytorch_transformers/configuration_roberta.py b/pytorch_transformers/configuration_roberta.py new file mode 100644 index 00000000000000..b92d6a908ba625 --- /dev/null +++ b/pytorch_transformers/configuration_roberta.py @@ -0,0 +1,35 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
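Both GPT configurations follow the same pattern, mapping their historical attribute names onto the common property names used across the library; an illustrative sketch using the defaults shown above::

    from pytorch_transformers import GPT2Config, OpenAIGPTConfig

    gpt2 = GPT2Config()
    assert gpt2.hidden_size == gpt2.n_embd == 768
    assert gpt2.max_position_embeddings == gpt2.n_positions == 1024

    gpt = OpenAIGPTConfig()
    assert gpt.max_position_embeddings == gpt.n_positions == 512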
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" RoBERTa configuration """ + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging + +from .configuration_bert import BertConfig + +logger = logging.getLogger(__name__) + +ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", + 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", + 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", +} + + +class RobertaConfig(BertConfig): + pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP diff --git a/pytorch_transformers/configuration_transfo_xl.py b/pytorch_transformers/configuration_transfo_xl.py new file mode 100644 index 00000000000000..2e966ee55cf458 --- /dev/null +++ b/pytorch_transformers/configuration_transfo_xl.py @@ -0,0 +1,167 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Transformer XL configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", +} + +class TransfoXLConfig(PretrainedConfig): + """Configuration class to store the configuration of a `TransfoXLModel`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. + cutoffs: cutoffs for the adaptive softmax + d_model: Dimensionality of the model's hidden states. + d_embed: Dimensionality of the embeddings + d_head: Dimensionality of the model's heads. + div_val: divident value for adapative input and softmax + pre_lnorm: apply LayerNorm to the input instead of the output + d_inner: Inner dimension in FF + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. 
+ tgt_len: number of tokens to predict + ext_len: length of the extended context + mem_len: length of the retained previous heads + same_length: use the same attn length for all tokens + proj_share_all_but_first: True to share all but first projs, False not to share. + attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. + clamp_len: use the same pos embeddings after clamp_len + sample_softmax: number of samples in sampled softmax + adaptive: use adaptive softmax + tie_weight: tie the word embedding and softmax weights + dropout: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + dropatt: The dropout ratio for the attention probabilities. + untie_r: untie relative position biases + embd_pdrop: The dropout ratio for the embeddings. + init: parameter initializer to use + init_range: parameters initialized by U(-init_range, init_range). + proj_init_std: parameters initialized by N(0, init_std) + init_std: parameters initialized by N(0, init_std) + """ + pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=267735, + cutoffs=[20000, 40000, 200000], + d_model=1024, + d_embed=1024, + n_head=16, + d_head=64, + d_inner=4096, + div_val=4, + pre_lnorm=False, + n_layer=18, + tgt_len=128, + ext_len=0, + mem_len=1600, + clamp_len=1000, + same_length=True, + proj_share_all_but_first=True, + attn_type=0, + sample_softmax=-1, + adaptive=True, + tie_weight=True, + dropout=0.1, + dropatt=0.0, + untie_r=True, + init="normal", + init_range=0.01, + proj_init_std=0.01, + init_std=0.02, + **kwargs): + """Constructs TransfoXLConfig. + """ + super(TransfoXLConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.n_token = vocab_size_or_config_json_file + self.cutoffs = [] + self.cutoffs.extend(cutoffs) + self.tie_weight = tie_weight + if proj_share_all_but_first: + self.tie_projs = [False] + [True] * len(self.cutoffs) + else: + self.tie_projs = [False] + [False] * len(self.cutoffs) + self.d_model = d_model + self.d_embed = d_embed + self.d_head = d_head + self.d_inner = d_inner + self.div_val = div_val + self.pre_lnorm = pre_lnorm + self.n_layer = n_layer + self.n_head = n_head + self.tgt_len = tgt_len + self.ext_len = ext_len + self.mem_len = mem_len + self.same_length = same_length + self.attn_type = attn_type + self.clamp_len = clamp_len + self.sample_softmax = sample_softmax + self.adaptive = adaptive + self.dropout = dropout + self.dropatt = dropatt + self.untie_r = untie_r + self.init = init + self.init_range = init_range + self.proj_init_std = proj_init_std + self.init_std = init_std + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + + @property + def max_position_embeddings(self): + return self.tgt_len + self.ext_len + self.mem_len + + @property + def vocab_size(self): + return self.n_token + + @vocab_size.setter + def vocab_size(self, value): + self.n_token = value + + @property + def hidden_size(self): + return self.d_model + + @property + def num_attention_heads(self): + 
return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/pytorch_transformers/configuration_utils.py b/pytorch_transformers/configuration_utils.py new file mode 100644 index 00000000000000..550b47fab84cde --- /dev/null +++ b/pytorch_transformers/configuration_utils.py @@ -0,0 +1,207 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Configuration base class and utilities.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import copy +import json +import logging +import os +from io import open + +from .file_utils import cached_path + +logger = logging.getLogger(__name__) + +CONFIG_NAME = "config.json" + +class PretrainedConfig(object): + r""" Base class for all configuration classes. + Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. + + Note: + A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. + It only affects the model's configuration. + + Class attributes (overridden by derived classes): + - ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values. + + Parameters: + ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint. + ``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens) + ``output_attentions``: boolean, default `False`. Should the model returns attentions weights. + ``output_hidden_states``: string, default `False`. Should the model returns all hidden-states. + ``torchscript``: string, default `False`. Is the model used with Torchscript. + """ + pretrained_config_archive_map = {} + + def __init__(self, **kwargs): + self.finetuning_task = kwargs.pop('finetuning_task', None) + self.num_labels = kwargs.pop('num_labels', 2) + self.output_attentions = kwargs.pop('output_attentions', False) + self.output_hidden_states = kwargs.pop('output_hidden_states', False) + self.torchscript = kwargs.pop('torchscript', False) + self.pruned_heads = kwargs.pop('pruned_heads', {}) + + def save_pretrained(self, save_directory): + """ Save a configuration object to the directory `save_directory`, so that it + can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method. 
+ """ + assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" + + # If we save using the predefined names, we can load using `from_pretrained` + output_config_file = os.path.join(save_directory, CONFIG_NAME) + + self.to_json_file(output_config_file) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. + + Parameters: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. + + - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. + - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + return_unused_kwargs: (`optional`) bool: + + - If False, then this function returns just the final configuration object. + - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. + + Examples:: + + # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a + # derived class: BertConfig + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + config = BertConfig.from_pretrained('./test/saved_model/') # E.g. 
config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') + config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) + assert config.output_attention == True + config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, + foo=False, return_unused_kwargs=True) + assert config.output_attention == True + assert unused_kwargs == {'foo': False} + + """ + cache_dir = kwargs.pop('cache_dir', None) + force_download = kwargs.pop('force_download', False) + proxies = kwargs.pop('proxies', None) + return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) + + if pretrained_model_name_or_path in cls.pretrained_config_archive_map: + config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path] + elif os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + else: + config_file = pretrained_model_name_or_path + # redirect to the cache, if necessary + try: + resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) + except EnvironmentError as e: + if pretrained_model_name_or_path in cls.pretrained_config_archive_map: + logger.error( + "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(cls.pretrained_config_archive_map.keys()), + config_file)) + raise e + if resolved_config_file == config_file: + logger.info("loading configuration file {}".format(config_file)) + else: + logger.info("loading configuration file {} from cache at {}".format( + config_file, resolved_config_file)) + + # Load config + config = cls.from_json_file(resolved_config_file) + + if hasattr(config, 'pruned_heads'): + config.pruned_heads = dict((int(key), set(value)) for key, value in config.pruned_heads.items()) + + # Update config with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info("Model config %s", config) + if return_unused_kwargs: + return config, kwargs + else: + return config + + @classmethod + def from_dict(cls, json_object): + """Constructs a `Config` from a Python dictionary of parameters.""" + config = cls(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with 
open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) diff --git a/pytorch_transformers/configuration_xlm.py b/pytorch_transformers/configuration_xlm.py new file mode 100644 index 00000000000000..ab251c8939e0ec --- /dev/null +++ b/pytorch_transformers/configuration_xlm.py @@ -0,0 +1,184 @@ +# coding=utf-8 +# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" XLM configuration """ +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", + 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json", + 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json", + 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json", + 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json", + 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json", + 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", + 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json", +} + + +class XLMConfig(PretrainedConfig): + """Configuration class to store the configuration of a `XLMModel`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`. + d_model: Size of the encoder layers and the pooler layer. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + d_inner: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + ff_activation: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + untie_r: untie relative position biases + attn_type: 'bi' for XLM, 'uni' for Transformer-XL + + dropout: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + dropatt: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). 
+ initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. + + dropout: float, dropout rate. + dropatt: float, dropout rate on attention probabilities. + init: str, the initialization scheme, either "normal" or "uniform". + init_range: float, initialize the parameters with a uniform distribution + in [-init_range, init_range]. Only effective when init="uniform". + init_std: float, initialize the parameters with a normal distribution + with mean 0 and stddev init_std. Only effective when init="normal". + mem_len: int, the number of tokens to cache. + reuse_len: int, the number of tokens in the currect batch to be cached + and reused in the future. + bi_data: bool, whether to use bidirectional input pipeline. + Usually set to True during pretraining and False during finetuning. + clamp_len: int, clamp all relative distances larger than clamp_len. + -1 means no clamping. + same_length: bool, whether to use the same attention length for each token. + """ + pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=30145, + emb_dim=2048, + n_layers=12, + n_heads=16, + dropout=0.1, + attention_dropout=0.1, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=1, + use_lang_emb=True, + max_position_embeddings=512, + embed_init_std=2048 ** -0.5, + layer_norm_eps=1e-12, + init_std=0.02, + bos_index=0, + eos_index=1, + pad_index=2, + unk_index=3, + mask_index=5, + is_encoder=True, + + finetuning_task=None, + num_labels=2, + summary_type='first', + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + start_n_top=5, + end_n_top=5, + **kwargs): + """Constructs XLMConfig. 
+ """ + super(XLMConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.n_words = vocab_size_or_config_json_file + self.emb_dim = emb_dim + self.n_layers = n_layers + self.n_heads = n_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.gelu_activation = gelu_activation + self.sinusoidal_embeddings = sinusoidal_embeddings + self.causal = causal + self.asm = asm + self.n_langs = n_langs + self.use_lang_emb = use_lang_emb + self.layer_norm_eps = layer_norm_eps + self.bos_index = bos_index + self.eos_index = eos_index + self.pad_index = pad_index + self.unk_index = unk_index + self.mask_index = mask_index + self.is_encoder = is_encoder + self.max_position_embeddings = max_position_embeddings + self.embed_init_std = embed_init_std + self.init_std = init_std + self.finetuning_task = finetuning_task + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_proj_to_labels = summary_proj_to_labels + self.summary_first_dropout = summary_first_dropout + self.start_n_top = start_n_top + self.end_n_top = end_n_top + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + + @property + def vocab_size(self): + return self.n_words + + @vocab_size.setter + def vocab_size(self, value): + self.n_words = value + + @property + def hidden_size(self): + return self.emb_dim + + @property + def num_attention_heads(self): + return self.n_heads + + @property + def num_hidden_layers(self): + return self.n_layers diff --git a/pytorch_transformers/configuration_xlnet.py b/pytorch_transformers/configuration_xlnet.py new file mode 100644 index 00000000000000..204d44aa7281ca --- /dev/null +++ b/pytorch_transformers/configuration_xlnet.py @@ -0,0 +1,172 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
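A round trip with the new ``configuration_utils`` base class: ``save_pretrained()`` writes ``config.json`` into an existing directory and ``from_pretrained()`` reads it back; an illustrative sketch using ``BertConfig`` as the concrete subclass::

    import tempfile
    from pytorch_transformers import BertConfig

    save_dir = tempfile.mkdtemp()            # save_pretrained() requires an existing directory
    config = BertConfig(num_hidden_layers=3)
    config.save_pretrained(save_dir)         # writes <save_dir>/config.json
    reloaded = BertConfig.from_pretrained(save_dir)
    assert reloaded == config                # PretrainedConfig.__eq__ compares __dict__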
+""" XLNet configuration """ +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", + 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", +} + + +class XLNetConfig(PretrainedConfig): + """Configuration class to store the configuration of a ``XLNetModel``. + + Args: + vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. + d_model: Size of the encoder layers and the pooler layer. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + d_inner: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + ff_activation: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + untie_r: untie relative position biases + attn_type: 'bi' for XLNet, 'uni' for Transformer-XL + + dropout: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + dropatt: The dropout ratio for the attention + probabilities. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. + + dropout: float, dropout rate. + dropatt: float, dropout rate on attention probabilities. + init: str, the initialization scheme, either "normal" or "uniform". + init_range: float, initialize the parameters with a uniform distribution + in [-init_range, init_range]. Only effective when init="uniform". + init_std: float, initialize the parameters with a normal distribution + with mean 0 and stddev init_std. Only effective when init="normal". + mem_len: int, the number of tokens to cache. + reuse_len: int, the number of tokens in the currect batch to be cached + and reused in the future. + bi_data: bool, whether to use bidirectional input pipeline. + Usually set to True during pretraining and False during finetuning. + clamp_len: int, clamp all relative distances larger than clamp_len. + -1 means no clamping. + same_length: bool, whether to use the same attention length for each token. + finetuning_task: name of the glue task on which the model was fine-tuned if any + """ + pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=32000, + d_model=1024, + n_layer=24, + n_head=16, + d_inner=4096, + ff_activation="gelu", + untie_r=True, + attn_type="bi", + + initializer_range=0.02, + layer_norm_eps=1e-12, + + dropout=0.1, + mem_len=None, + reuse_len=None, + bi_data=False, + clamp_len=-1, + same_length=False, + + finetuning_task=None, + num_labels=2, + summary_type='last', + summary_use_proj=True, + summary_activation='tanh', + summary_last_dropout=0.1, + start_n_top=5, + end_n_top=5, + **kwargs): + """Constructs XLNetConfig. 
+ """ + super(XLNetConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.n_token = vocab_size_or_config_json_file + self.d_model = d_model + self.n_layer = n_layer + self.n_head = n_head + assert d_model % n_head == 0 + self.d_head = d_model // n_head + self.ff_activation = ff_activation + self.d_inner = d_inner + self.untie_r = untie_r + self.attn_type = attn_type + + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + + self.dropout = dropout + self.mem_len = mem_len + self.reuse_len = reuse_len + self.bi_data = bi_data + self.clamp_len = clamp_len + self.same_length = same_length + + self.finetuning_task = finetuning_task + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_last_dropout = summary_last_dropout + self.start_n_top = start_n_top + self.end_n_top = end_n_top + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + + @property + def max_position_embeddings(self): + return -1 + + @property + def vocab_size(self): + return self.n_token + + @vocab_size.setter + def vocab_size(self, value): + self.n_token = value + + @property + def hidden_size(self): + return self.d_model + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/pytorch_transformers/file_utils.py b/pytorch_transformers/file_utils.py index f6f2151b12430a..37ebc57fb3cd7c 100644 --- a/pytorch_transformers/file_utils.py +++ b/pytorch_transformers/file_utils.py @@ -9,6 +9,7 @@ import json import logging import os +import six import shutil import tempfile import fnmatch @@ -49,6 +50,29 @@ logger = logging.getLogger(__name__) # pylint: disable=invalid-name +if not six.PY2: + def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = ''.join(docstr) + fn.__doc__ + return fn + return docstring_decorator + + def add_end_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + ''.join(docstr) + return fn + return docstring_decorator +else: + # Not possible to update class docstrings on python2 + def add_start_docstrings(*docstr): + def docstring_decorator(fn): + return fn + return docstring_decorator + + def add_end_docstrings(*docstr): + def docstring_decorator(fn): + return fn + return docstring_decorator def url_to_filename(url, etag=None): """ diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py index 05ff5e5b33cf45..31c8fafaa90fa0 100644 --- a/pytorch_transformers/modeling_auto.py +++ b/pytorch_transformers/modeling_auto.py @@ -18,123 +18,20 @@ import logging -from .modeling_bert import BertConfig, BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering -from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTLMHeadModel -from .modeling_gpt2 import GPT2Config, GPT2Model, GPT2LMHeadModel -from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel -from .modeling_xlnet import 
XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering -from .modeling_xlm import XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering -from .modeling_roberta import RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification -from .modeling_distilbert import DistilBertConfig, DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification +from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering +from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel +from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel +from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel +from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering +from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering +from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification +from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification -from .modeling_utils import PreTrainedModel, SequenceSummary, add_start_docstrings +from .modeling_utils import PreTrainedModel, SequenceSummary -logger = logging.getLogger(__name__) - - -class AutoConfig(object): - r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class - that will be instantiated as one of the configuration classes of the library - when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` - class method. - - The `from_pretrained()` method take care of returning the correct model class instance - using pattern matching on the `pretrained_model_name_or_path` string. - - The base model class to instantiate is selected as the first pattern matching - in the `pretrained_model_name_or_path` string (in the following order): - - contains `distilbert`: DistilBertConfig (DistilBERT model) - - contains `bert`: BertConfig (Bert model) - - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) - - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) - - contains `xlnet`: XLNetConfig (XLNet model) - - contains `xlm`: XLMConfig (XLM model) - - contains `roberta`: RobertaConfig (RoBERTa model) - - This class cannot be instantiated using `__init__()` (throw an error). - """ - def __init__(self): - raise EnvironmentError("AutoConfig is designed to be instantiated " - "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.") - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" Instantiate a one of the configuration classes of the library - from a pre-trained model configuration. 
- - The configuration class to instantiate is selected as the first pattern matching - in the `pretrained_model_name_or_path` string (in the following order): - - contains `distilbert`: DistilBertConfig (DistilBERT model) - - contains `bert`: BertConfig (Bert model) - - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) - - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) - - contains `xlnet`: XLNetConfig (XLNet model) - - contains `xlm`: XLMConfig (XLM model) - - contains `roberta`: RobertaConfig (RoBERTa model) - - Params: - pretrained_model_name_or_path: either: - - - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. - - cache_dir: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - - kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. - - - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. - - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. +from .file_utils import add_start_docstrings - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. - - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - - return_unused_kwargs: (`optional`) bool: - - - If False, then this function returns just the final configuration object. - - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. - - Examples:: - - config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. - config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. 
config (or model) was saved using `save_pretrained('./test/saved_model/')` - config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') - config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) - assert config.output_attention == True - config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, - foo=False, return_unused_kwargs=True) - assert config.output_attention == True - assert unused_kwargs == {'foo': False} - - """ - if 'distilbert' in pretrained_model_name_or_path: - return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: - return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'bert' in pretrained_model_name_or_path: - return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: - return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: - return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: - return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: - return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: - return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta'".format(pretrained_model_name_or_path)) +logger = logging.getLogger(__name__) class AutoModel(object): diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index e2d83460713504..1d891830197772 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -28,8 +28,9 @@ from torch import nn from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, - prune_linear_layer, add_start_docstrings) +from .modeling_utils import PreTrainedModel, prune_linear_layer +from .configuration_bert import BertConfig +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) @@ -49,23 +50,6 @@ 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", } -BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", - 'bert-base-german-cased': 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", - 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", - 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", - 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", -} - - def load_tf_weights_in_bert(model, config, tf_checkpoint_path): """ Load tf checkpoints in a pytorch model. """ @@ -149,77 +133,6 @@ def swish(x): ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} -class BertConfig(PretrainedConfig): - r""" - :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a - `BertModel`. - - - Arguments: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. - intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `BertModel`. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_eps: The epsilon used by LayerNorm. 
- """ - pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__(self, - vocab_size_or_config_json_file=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - **kwargs): - super(BertConfig, self).__init__(**kwargs) - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") - - - try: from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm except (ImportError, AttributeError) as e: diff --git a/pytorch_transformers/modeling_distilbert.py b/pytorch_transformers/modeling_distilbert.py index d9a2f1a1770f03..0e63a87a3e535c 100644 --- a/pytorch_transformers/modeling_distilbert.py +++ b/pytorch_transformers/modeling_distilbert.py @@ -31,7 +31,9 @@ import torch import torch.nn as nn -from pytorch_transformers.modeling_utils import PretrainedConfig, PreTrainedModel, add_start_docstrings, prune_linear_layer +from .modeling_utils import PreTrainedModel, prune_linear_layer +from .configuration_distilbert import DistilBertConfig +from .file_utils import add_start_docstrings import logging logger = logging.getLogger(__name__) @@ -42,69 +44,6 @@ 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin" } -DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", - 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" -} - - -class DistilBertConfig(PretrainedConfig): - pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__(self, - vocab_size_or_config_json_file=30522, - max_position_embeddings=512, - sinusoidal_pos_embds=True, - n_layers=6, - n_heads=12, - dim=768, - hidden_dim=4*768, - dropout=0.1, - attention_dropout=0.1, - activation='gelu', - initializer_range=0.02, - tie_weights_=True, - qa_dropout=0.1, - seq_classif_dropout=0.2, - **kwargs): - super(DistilBertConfig, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", 
encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.max_position_embeddings = max_position_embeddings - self.sinusoidal_pos_embds = sinusoidal_pos_embds - self.n_layers = n_layers - self.n_heads = n_heads - self.dim = dim - self.hidden_dim = hidden_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation = activation - self.initializer_range = initializer_range - self.tie_weights_ = tie_weights_ - self.qa_dropout = qa_dropout - self.seq_classif_dropout = seq_classif_dropout - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") - @property - def hidden_size(self): - return self.dim - - @property - def num_attention_heads(self): - return self.n_heads - - @property - def num_hidden_layers(self): - return self.n_layers - ### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ### def gelu(x): diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 017ad4f7b47819..175289150d048b 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -30,19 +30,15 @@ from torch.nn import CrossEntropyLoss from torch.nn.parameter import Parameter -from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, - PreTrainedModel, prune_conv1d_layer, SequenceSummary, - add_start_docstrings) -from .modeling_bert import BertLayerNorm as LayerNorm +from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary +from .configuration_gpt2 import GPT2Config +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin", "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"} -GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", - "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", - "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"} def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): """ Load tf checkpoints in a pytorch model @@ -102,120 +98,6 @@ def gelu(x): return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) -class GPT2Config(PretrainedConfig): - """Configuration class to store the configuration of a `GPT2Model`. - - Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - layer_norm_epsilon: epsilon to use in the layer norm layers - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. 
- embd_pdrop: The dropout ratio for the embeddings. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - """ - pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__( - self, - vocab_size_or_config_json_file=50257, - n_positions=1024, - n_ctx=1024, - n_embd=768, - n_layer=12, - n_head=12, - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - - num_labels=1, - summary_type='cls_index', - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - **kwargs - ): - """Constructs GPT2Config. - - Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - layer_norm_epsilon: epsilon to use in the layer norm layers - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - """ - super(GPT2Config, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.n_ctx = n_ctx - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - - self.num_labels = num_labels - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_first_dropout = summary_first_dropout - self.summary_proj_to_labels = summary_proj_to_labels - else: - raise ValueError( - "First argument must be either a vocabulary size (int)" - "or the path to a pretrained model config file (str)" - ) - - @property - def max_position_embeddings(self): - return self.n_positions - - @property - def hidden_size(self): - return self.n_embd - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer - - - class Attention(nn.Module): def __init__(self, nx, n_ctx, config, scale=False): super(Attention, self).__init__() @@ -332,9 +214,9 @@ class Block(nn.Module): def __init__(self, n_ctx, config, scale=False): super(Block, self).__init__() nx = config.n_embd - self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.attn = Attention(nx, n_ctx, config, scale) - self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.ln_2 = nn.LayerNorm(nx, 
eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) def forward(self, x, layer_past=None, head_mask=None): @@ -370,7 +252,7 @@ def _init_weights(self, module): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: module.bias.data.zero_() - elif isinstance(module, LayerNorm): + elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) @@ -458,7 +340,7 @@ def __init__(self, config): self.wpe = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) - self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) + self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.init_weights() diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index 8bf9d86696c154..05268d28b75ccd 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -30,15 +30,13 @@ from torch.nn import CrossEntropyLoss from torch.nn.parameter import Parameter -from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, - PreTrainedModel, prune_conv1d_layer, SequenceSummary, - add_start_docstrings) -from .modeling_bert import BertLayerNorm as LayerNorm +from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary +from .configuration_openai import OpenAIGPTConfig +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"} -OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"} def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): @@ -127,111 +125,6 @@ def swish(x): ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu} -class OpenAIGPTConfig(PretrainedConfig): - """ - Configuration class to store the configuration of a `OpenAIGPTModel`. - - Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. - n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - afn: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings. - layer_norm_epsilon: epsilon to use in the layer norm layers - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. 
- predict_special_tokens: should we predict special tokens (when the model has a LM head) - """ - pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__( - self, - vocab_size_or_config_json_file=40478, - n_positions=512, - n_ctx=512, - n_embd=768, - n_layer=12, - n_head=12, - afn="gelu", - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - predict_special_tokens=True, - - num_labels=1, - summary_type='cls_index', - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - **kwargs - ): - """Constructs OpenAIGPTConfig. - """ - super(OpenAIGPTConfig, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.n_ctx = n_ctx - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.afn = afn - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.predict_special_tokens = predict_special_tokens - - self.num_labels = num_labels - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_first_dropout = summary_first_dropout - self.summary_proj_to_labels = summary_proj_to_labels - else: - raise ValueError( - "First argument must be either a vocabulary size (int)" - "or the path to a pretrained model config file (str)" - ) - - @property - def max_position_embeddings(self): - return self.n_positions - - @property - def hidden_size(self): - return self.n_embd - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer - - class Attention(nn.Module): def __init__(self, nx, n_ctx, config, scale=False): super(Attention, self).__init__() @@ -342,9 +235,9 @@ def __init__(self, n_ctx, config, scale=False): super(Block, self).__init__() nx = config.n_embd self.attn = Attention(nx, n_ctx, config, scale) - self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) - self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) def forward(self, x, head_mask=None): attn_outputs = self.attn(x, head_mask=head_mask) @@ -376,7 +269,7 @@ def _init_weights(self, module): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: module.bias.data.zero_() - elif isinstance(module, LayerNorm): + elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py index 6ae5cd1d44048f..cf1f86276bfbc6 100644 --- a/pytorch_transformers/modeling_roberta.py +++ b/pytorch_transformers/modeling_roberta.py @@ -22,14 +22,11 @@ import torch import torch.nn as nn -import torch.nn.functional as 
F from torch.nn import CrossEntropyLoss, MSELoss -from pytorch_transformers.modeling_bert import (BertConfig, BertEmbeddings, - BertLayerNorm, BertModel, - BertPreTrainedModel, gelu) - -from pytorch_transformers.modeling_utils import add_start_docstrings +from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu +from .configuration_roberta import RobertaConfig +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) @@ -39,13 +36,6 @@ 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin", } -ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", - 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", - 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", -} - - class RobertaEmbeddings(BertEmbeddings): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. @@ -64,10 +54,6 @@ def forward(self, input_ids, token_type_ids=None, position_ids=None): return super(RobertaEmbeddings, self).forward(input_ids, token_type_ids=token_type_ids, position_ids=position_ids) -class RobertaConfig(BertConfig): - pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP - - ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py index 9e2808f218f27c..9b8a98a839f704 100644 --- a/pytorch_transformers/modeling_transfo_xl.py +++ b/pytorch_transformers/modeling_transfo_xl.py @@ -34,18 +34,16 @@ from torch.nn import CrossEntropyLoss from torch.nn.parameter import Parameter -from .modeling_bert import BertLayerNorm as LayerNorm +from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary +from .configuration_transfo_xl import TransfoXLConfig from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits -from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings) +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = { 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin", } -TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", -} def build_tf_to_pytorch_map(model, config): """ A map of modules from TF to PyTorch. @@ -175,143 +173,6 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): return model -class TransfoXLConfig(PretrainedConfig): - """Configuration class to store the configuration of a `TransfoXLModel`. - - Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. - cutoffs: cutoffs for the adaptive softmax - d_model: Dimensionality of the model's hidden states. - d_embed: Dimensionality of the embeddings - d_head: Dimensionality of the model's heads. 
- div_val: divident value for adapative input and softmax - pre_lnorm: apply LayerNorm to the input instead of the output - d_inner: Inner dimension in FF - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - tgt_len: number of tokens to predict - ext_len: length of the extended context - mem_len: length of the retained previous heads - same_length: use the same attn length for all tokens - proj_share_all_but_first: True to share all but first projs, False not to share. - attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. - clamp_len: use the same pos embeddings after clamp_len - sample_softmax: number of samples in sampled softmax - adaptive: use adaptive softmax - tie_weight: tie the word embedding and softmax weights - dropout: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - dropatt: The dropout ratio for the attention probabilities. - untie_r: untie relative position biases - embd_pdrop: The dropout ratio for the embeddings. - init: parameter initializer to use - init_range: parameters initialized by U(-init_range, init_range). - proj_init_std: parameters initialized by N(0, init_std) - init_std: parameters initialized by N(0, init_std) - """ - pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__(self, - vocab_size_or_config_json_file=267735, - cutoffs=[20000, 40000, 200000], - d_model=1024, - d_embed=1024, - n_head=16, - d_head=64, - d_inner=4096, - div_val=4, - pre_lnorm=False, - n_layer=18, - tgt_len=128, - ext_len=0, - mem_len=1600, - clamp_len=1000, - same_length=True, - proj_share_all_but_first=True, - attn_type=0, - sample_softmax=-1, - adaptive=True, - tie_weight=True, - dropout=0.1, - dropatt=0.0, - untie_r=True, - init="normal", - init_range=0.01, - proj_init_std=0.01, - init_std=0.02, - **kwargs): - """Constructs TransfoXLConfig. 
- """ - super(TransfoXLConfig, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.n_token = vocab_size_or_config_json_file - self.cutoffs = [] - self.cutoffs.extend(cutoffs) - self.tie_weight = tie_weight - if proj_share_all_but_first: - self.tie_projs = [False] + [True] * len(self.cutoffs) - else: - self.tie_projs = [False] + [False] * len(self.cutoffs) - self.d_model = d_model - self.d_embed = d_embed - self.d_head = d_head - self.d_inner = d_inner - self.div_val = div_val - self.pre_lnorm = pre_lnorm - self.n_layer = n_layer - self.n_head = n_head - self.tgt_len = tgt_len - self.ext_len = ext_len - self.mem_len = mem_len - self.same_length = same_length - self.attn_type = attn_type - self.clamp_len = clamp_len - self.sample_softmax = sample_softmax - self.adaptive = adaptive - self.dropout = dropout - self.dropatt = dropatt - self.untie_r = untie_r - self.init = init - self.init_range = init_range - self.proj_init_std = proj_init_std - self.init_std = init_std - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") - - @property - def max_position_embeddings(self): - return self.tgt_len + self.ext_len + self.mem_len - - @property - def vocab_size(self): - return self.n_token - - @vocab_size.setter - def vocab_size(self, value): - self.n_token = value - - @property - def hidden_size(self): - return self.d_model - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer - - class PositionalEmbedding(nn.Module): def __init__(self, demb): super(PositionalEmbedding, self).__init__() @@ -347,7 +208,7 @@ def __init__(self, d_model, d_inner, dropout, pre_lnorm=False): nn.Dropout(dropout), ) - self.layer_norm = LayerNorm(d_model) + self.layer_norm = nn.LayerNorm(d_model) self.pre_lnorm = pre_lnorm @@ -387,7 +248,7 @@ def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, self.dropatt = nn.Dropout(dropatt) self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) - self.layer_norm = LayerNorm(d_model) + self.layer_norm = nn.LayerNorm(d_model) self.scale = 1 / (d_head ** 0.5) @@ -477,7 +338,7 @@ def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, self.dropatt = nn.Dropout(dropatt) self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) - self.layer_norm = LayerNorm(d_model) + self.layer_norm = nn.LayerNorm(d_model) self.scale = 1 / (d_head ** 0.5) diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index d482addb83d39b..48420f6d07ec85 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -30,11 +30,11 @@ from torch.nn import CrossEntropyLoss from torch.nn import functional as F +from .configuration_utils import PretrainedConfig from .file_utils import cached_path logger = logging.getLogger(__name__) -CONFIG_NAME = "config.json" WEIGHTS_NAME = "pytorch_model.bin" TF_WEIGHTS_NAME = 'model.ckpt' @@ -52,209 +52,6 @@ def __init__(self, *args, **kwargs): def forward(self, input): return input - -if not six.PY2: - def add_start_docstrings(*docstr): - def 
docstring_decorator(fn): - fn.__doc__ = ''.join(docstr) + fn.__doc__ - return fn - return docstring_decorator - - def add_end_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + ''.join(docstr) - return fn - return docstring_decorator -else: - # Not possible to update class docstrings on python2 - def add_start_docstrings(*docstr): - def docstring_decorator(fn): - return fn - return docstring_decorator - - def add_end_docstrings(*docstr): - def docstring_decorator(fn): - return fn - return docstring_decorator - - -class PretrainedConfig(object): - r""" Base class for all configuration classes. - Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. - - Note: - A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. - It only affects the model's configuration. - - Class attributes (overridden by derived classes): - - ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values. - - Parameters: - ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint. - ``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens) - ``output_attentions``: boolean, default `False`. Should the model returns attentions weights. - ``output_hidden_states``: string, default `False`. Should the model returns all hidden-states. - ``torchscript``: string, default `False`. Is the model used with Torchscript. - """ - pretrained_config_archive_map = {} - - def __init__(self, **kwargs): - self.finetuning_task = kwargs.pop('finetuning_task', None) - self.num_labels = kwargs.pop('num_labels', 2) - self.output_attentions = kwargs.pop('output_attentions', False) - self.output_hidden_states = kwargs.pop('output_hidden_states', False) - self.torchscript = kwargs.pop('torchscript', False) - self.pruned_heads = kwargs.pop('pruned_heads', {}) - - def save_pretrained(self, save_directory): - """ Save a configuration object to the directory `save_directory`, so that it - can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method. - """ - assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" - - # If we save using the predefined names, we can load using `from_pretrained` - output_config_file = os.path.join(save_directory, CONFIG_NAME) - - self.to_json_file(output_config_file) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. - - Parameters: - pretrained_model_name_or_path: either: - - - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. 
- - cache_dir: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - - kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. - - - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. - - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. - - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. - - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - - return_unused_kwargs: (`optional`) bool: - - - If False, then this function returns just the final configuration object. - - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. - - Examples:: - - # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a - # derived class: BertConfig - config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. - config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` - config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') - config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) - assert config.output_attention == True - config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, - foo=False, return_unused_kwargs=True) - assert config.output_attention == True - assert unused_kwargs == {'foo': False} - - """ - cache_dir = kwargs.pop('cache_dir', None) - force_download = kwargs.pop('force_download', False) - proxies = kwargs.pop('proxies', None) - return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) - - if pretrained_model_name_or_path in cls.pretrained_config_archive_map: - config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path] - elif os.path.isdir(pretrained_model_name_or_path): - config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) - else: - config_file = pretrained_model_name_or_path - # redirect to the cache, if necessary - try: - resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) - except EnvironmentError as e: - if pretrained_model_name_or_path in cls.pretrained_config_archive_map: - logger.error( - "Couldn't reach server at '{}' to download pretrained model configuration file.".format( - config_file)) - else: - logger.error( - "Model name '{}' was not found in model name list ({}). 
" - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name_or_path, - ', '.join(cls.pretrained_config_archive_map.keys()), - config_file)) - raise e - if resolved_config_file == config_file: - logger.info("loading configuration file {}".format(config_file)) - else: - logger.info("loading configuration file {} from cache at {}".format( - config_file, resolved_config_file)) - - # Load config - config = cls.from_json_file(resolved_config_file) - - if hasattr(config, 'pruned_heads'): - config.pruned_heads = dict((int(key), set(value)) for key, value in config.pruned_heads.items()) - - # Update config with kwargs if needed - to_remove = [] - for key, value in kwargs.items(): - if hasattr(config, key): - setattr(config, key, value) - to_remove.append(key) - for key in to_remove: - kwargs.pop(key, None) - - logger.info("Model config %s", config) - if return_unused_kwargs: - return config, kwargs - else: - return config - - @classmethod - def from_dict(cls, json_object): - """Constructs a `Config` from a Python dictionary of parameters.""" - config = cls(vocab_size_or_config_json_file=-1) - for key, value in json_object.items(): - config.__dict__[key] = value - return config - - @classmethod - def from_json_file(cls, json_file): - """Constructs a `BertConfig` from a json file of parameters.""" - with open(json_file, "r", encoding='utf-8') as reader: - text = reader.read() - return cls.from_dict(json.loads(text)) - - def __eq__(self, other): - return self.__dict__ == other.__dict__ - - def __repr__(self): - return str(self.to_json_string()) - - def to_dict(self): - """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output - - def to_json_string(self): - """Serializes this instance to a JSON string.""" - return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" - - def to_json_file(self, json_file_path): - """ Save this instance to a json file.""" - with open(json_file_path, "w", encoding='utf-8') as writer: - writer.write(self.to_json_string()) - - class PreTrainedModel(nn.Module): r""" Base class for all models. 
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index 847a3ecfb7d75d..80c32cbf419d57 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -16,11 +16,8 @@ """ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging import math -import sys -from io import open import itertools import numpy as np @@ -30,8 +27,9 @@ from torch.nn import functional as F from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings, - prune_linear_layer, SequenceSummary, SQuADHead) +from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, SQuADHead +from .configuration_xlm import XLMConfig +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) @@ -47,164 +45,6 @@ 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin", 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin", } -XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", - 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json", - 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json", - 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json", - 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json", - 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json", - 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", - 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", - 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json", -} - - -class XLMConfig(PretrainedConfig): - """Configuration class to store the configuration of a `XLMModel`. - - Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`. - d_model: Size of the encoder layers and the pooler layer. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - d_inner: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - ff_activation: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - untie_r: untie relative position biases - attn_type: 'bi' for XLM, 'uni' for Transformer-XL - - dropout: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - dropatt: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_eps: The epsilon used by LayerNorm. 
- - dropout: float, dropout rate. - dropatt: float, dropout rate on attention probabilities. - init: str, the initialization scheme, either "normal" or "uniform". - init_range: float, initialize the parameters with a uniform distribution - in [-init_range, init_range]. Only effective when init="uniform". - init_std: float, initialize the parameters with a normal distribution - with mean 0 and stddev init_std. Only effective when init="normal". - mem_len: int, the number of tokens to cache. - reuse_len: int, the number of tokens in the currect batch to be cached - and reused in the future. - bi_data: bool, whether to use bidirectional input pipeline. - Usually set to True during pretraining and False during finetuning. - clamp_len: int, clamp all relative distances larger than clamp_len. - -1 means no clamping. - same_length: bool, whether to use the same attention length for each token. - """ - pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__(self, - vocab_size_or_config_json_file=30145, - emb_dim=2048, - n_layers=12, - n_heads=16, - dropout=0.1, - attention_dropout=0.1, - gelu_activation=True, - sinusoidal_embeddings=False, - causal=False, - asm=False, - n_langs=1, - use_lang_emb=True, - max_position_embeddings=512, - embed_init_std=2048 ** -0.5, - layer_norm_eps=1e-12, - init_std=0.02, - bos_index=0, - eos_index=1, - pad_index=2, - unk_index=3, - mask_index=5, - is_encoder=True, - - finetuning_task=None, - num_labels=2, - summary_type='first', - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - start_n_top=5, - end_n_top=5, - **kwargs): - """Constructs XLMConfig. - """ - super(XLMConfig, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.n_words = vocab_size_or_config_json_file - self.emb_dim = emb_dim - self.n_layers = n_layers - self.n_heads = n_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.gelu_activation = gelu_activation - self.sinusoidal_embeddings = sinusoidal_embeddings - self.causal = causal - self.asm = asm - self.n_langs = n_langs - self.use_lang_emb = use_lang_emb - self.layer_norm_eps = layer_norm_eps - self.bos_index = bos_index - self.eos_index = eos_index - self.pad_index = pad_index - self.unk_index = unk_index - self.mask_index = mask_index - self.is_encoder = is_encoder - self.max_position_embeddings = max_position_embeddings - self.embed_init_std = embed_init_std - self.init_std = init_std - self.finetuning_task = finetuning_task - self.num_labels = num_labels - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_proj_to_labels = summary_proj_to_labels - self.summary_first_dropout = summary_first_dropout - self.start_n_top = start_n_top - self.end_n_top = end_n_top - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") - - @property - def vocab_size(self): - return self.n_words - - @vocab_size.setter - def vocab_size(self, value): - self.n_words = value - - @property - def hidden_size(self): - return 
self.emb_dim - - @property - def num_attention_heads(self): - return self.n_heads - - @property - def num_hidden_layers(self): - return self.n_layers def create_sinusoidal_embeddings(n_pos, dim, out): diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py index 516e87e99bad2b..3ac791b12194e1 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/pytorch_transformers/modeling_xlnet.py @@ -29,9 +29,9 @@ from torch.nn import functional as F from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, - SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, - add_start_docstrings) +from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits +from .configuration_xlnet import XLNetConfig +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) @@ -40,10 +40,6 @@ 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin", 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin", } -XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", - 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", -} def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None): @@ -192,147 +188,6 @@ def swish(x): ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} -class XLNetConfig(PretrainedConfig): - """Configuration class to store the configuration of a ``XLNetModel``. - - Args: - vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. - d_model: Size of the encoder layers and the pooler layer. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - d_inner: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - ff_activation: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - untie_r: untie relative position biases - attn_type: 'bi' for XLNet, 'uni' for Transformer-XL - - dropout: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - dropatt: The dropout ratio for the attention - probabilities. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_eps: The epsilon used by LayerNorm. - - dropout: float, dropout rate. - dropatt: float, dropout rate on attention probabilities. - init: str, the initialization scheme, either "normal" or "uniform". - init_range: float, initialize the parameters with a uniform distribution - in [-init_range, init_range]. Only effective when init="uniform". - init_std: float, initialize the parameters with a normal distribution - with mean 0 and stddev init_std. Only effective when init="normal". - mem_len: int, the number of tokens to cache. - reuse_len: int, the number of tokens in the currect batch to be cached - and reused in the future. - bi_data: bool, whether to use bidirectional input pipeline. - Usually set to True during pretraining and False during finetuning. 
- clamp_len: int, clamp all relative distances larger than clamp_len. - -1 means no clamping. - same_length: bool, whether to use the same attention length for each token. - finetuning_task: name of the glue task on which the model was fine-tuned if any - """ - pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__(self, - vocab_size_or_config_json_file=32000, - d_model=1024, - n_layer=24, - n_head=16, - d_inner=4096, - ff_activation="gelu", - untie_r=True, - attn_type="bi", - - initializer_range=0.02, - layer_norm_eps=1e-12, - - dropout=0.1, - mem_len=None, - reuse_len=None, - bi_data=False, - clamp_len=-1, - same_length=False, - - finetuning_task=None, - num_labels=2, - summary_type='last', - summary_use_proj=True, - summary_activation='tanh', - summary_last_dropout=0.1, - start_n_top=5, - end_n_top=5, - **kwargs): - """Constructs XLNetConfig. - """ - super(XLNetConfig, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.n_token = vocab_size_or_config_json_file - self.d_model = d_model - self.n_layer = n_layer - self.n_head = n_head - assert d_model % n_head == 0 - self.d_head = d_model // n_head - self.ff_activation = ff_activation - self.d_inner = d_inner - self.untie_r = untie_r - self.attn_type = attn_type - - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - - self.dropout = dropout - self.mem_len = mem_len - self.reuse_len = reuse_len - self.bi_data = bi_data - self.clamp_len = clamp_len - self.same_length = same_length - - self.finetuning_task = finetuning_task - self.num_labels = num_labels - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_last_dropout = summary_last_dropout - self.start_n_top = start_n_top - self.end_n_top = end_n_top - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") - - @property - def max_position_embeddings(self): - return -1 - - @property - def vocab_size(self): - return self.n_token - - @vocab_size.setter - def vocab_size(self, value): - self.n_token = value - - @property - def hidden_size(self): - return self.d_model - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer - - try: from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm except (ImportError, AttributeError) as e: diff --git a/pytorch_transformers/tests/configuration_common_test.py b/pytorch_transformers/tests/configuration_common_test.py new file mode 100644 index 00000000000000..8ee751153c1721 --- /dev/null +++ b/pytorch_transformers/tests/configuration_common_test.py @@ -0,0 +1,63 @@ +# coding=utf-8 +# Copyright 2019 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import os +import shutil +import json +import random +import uuid + +import unittest +import logging + + +class ConfigTester(object): + def __init__(self, parent, config_class=None, **kwargs): + self.parent = parent + self.config_class = config_class + self.inputs_dict = kwargs + + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, 'vocab_size')) + self.parent.assertTrue(hasattr(config, 'hidden_size')) + self.parent.assertTrue(hasattr(config, 'num_attention_heads')) + self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) + + def create_and_test_config_to_json_string(self): + config = self.config_class(**self.inputs_dict) + obj = json.loads(config.to_json_string()) + for key, value in self.inputs_dict.items(): + self.parent.assertEqual(obj[key], value) + + def create_and_test_config_to_json_file(self): + config_first = self.config_class(**self.inputs_dict) + json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json") + config_first.to_json_file(json_file_path) + config_second = self.config_class.from_json_file(json_file_path) + os.remove(json_file_path) + self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) + + def run_common_tests(self): + self.create_and_test_config_common_properties() + self.create_and_test_config_to_json_string() + self.create_and_test_config_to_json_file() + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/pytorch_transformers/tests/modeling_auto_test.py b/pytorch_transformers/tests/modeling_auto_test.py index 09d09b28fc287e..dfdedbbe6129ba 100644 --- a/pytorch_transformers/tests/modeling_auto_test.py +++ b/pytorch_transformers/tests/modeling_auto_test.py @@ -28,7 +28,8 @@ AutoModelForQuestionAnswering, BertForQuestionAnswering) from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class AutoModelTest(unittest.TestCase): diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py index ac5d2636a9cf39..39d169972d5320 100644 --- a/pytorch_transformers/tests/modeling_bert_test.py +++ b/pytorch_transformers/tests/modeling_bert_test.py @@ -26,7 +26,8 @@ BertForTokenClassification, BertForMultipleChoice) from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class BertModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py index 8b1a70fcf3c372..c50d6678d8edee 100644 --- 
a/pytorch_transformers/tests/modeling_common_test.py +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -28,9 +28,9 @@ import torch -from pytorch_transformers import PretrainedConfig, PreTrainedModel -from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP -from pytorch_transformers.modeling_gpt2 import GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP +from pytorch_transformers import (PretrainedConfig, PreTrainedModel, + BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) def _config_zero_init(config): diff --git a/pytorch_transformers/tests/modeling_distilbert_test.py b/pytorch_transformers/tests/modeling_distilbert_test.py index 1c9d9c792dd6e5..a7457a42c5c73a 100644 --- a/pytorch_transformers/tests/modeling_distilbert_test.py +++ b/pytorch_transformers/tests/modeling_distilbert_test.py @@ -18,13 +18,15 @@ import unittest import shutil +import sys import pytest from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification) from pytorch_transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class DistilBertModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py index 1786ada54cdb9a..2717805120eaff 100644 --- a/pytorch_transformers/tests/modeling_gpt2_test.py +++ b/pytorch_transformers/tests/modeling_gpt2_test.py @@ -24,7 +24,8 @@ from pytorch_transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2LMHeadModel, GPT2DoubleHeadsModel) -from .modeling_common_test import CommonTestCases, ConfigTester, ids_tensor +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class GPT2ModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py index 0fcb4b7d64ded6..dbef6c52eb8e1c 100644 --- a/pytorch_transformers/tests/modeling_openai_test.py +++ b/pytorch_transformers/tests/modeling_openai_test.py @@ -24,7 +24,8 @@ from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) -from .modeling_common_test import CommonTestCases, ConfigTester, ids_tensor +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_roberta_test.py b/pytorch_transformers/tests/modeling_roberta_test.py index 94035e9667bca5..0471505b5e7f12 100644 --- a/pytorch_transformers/tests/modeling_roberta_test.py +++ b/pytorch_transformers/tests/modeling_roberta_test.py @@ -24,7 +24,8 @@ from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification) from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) +from .modeling_common_test import (CommonTestCases, ids_tensor) 
+from .configuration_common_test import ConfigTester class RobertaModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py index e3c0fbcdf0c0f9..a060432cc49385 100644 --- a/pytorch_transformers/tests/modeling_transfo_xl_test.py +++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py @@ -28,7 +28,8 @@ from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class TransfoXLModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py index 4308c18d45bb60..dcd09634770be8 100644 --- a/pytorch_transformers/tests/modeling_xlm_test.py +++ b/pytorch_transformers/tests/modeling_xlm_test.py @@ -23,7 +23,8 @@ from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification) from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class XLMModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py index 290c5766e2fc12..4445bc17ac4e69 100644 --- a/pytorch_transformers/tests/modeling_xlnet_test.py +++ b/pytorch_transformers/tests/modeling_xlnet_test.py @@ -28,7 +28,8 @@ from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering) from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class XLNetModelTest(CommonTestCases.CommonModelTester): From d77abd4d08fa4b38b7d3fc7dfe970875eeaaa870 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 5 Sep 2019 00:41:24 +0200 Subject: [PATCH 096/119] clean ups --- pytorch_transformers/tests/modeling_distilbert_test.py | 4 ---- pytorch_transformers/tests/modeling_transfo_xl_test.py | 2 -- 2 files changed, 6 deletions(-) diff --git a/pytorch_transformers/tests/modeling_distilbert_test.py b/pytorch_transformers/tests/modeling_distilbert_test.py index a7457a42c5c73a..c1503b4355a01e 100644 --- a/pytorch_transformers/tests/modeling_distilbert_test.py +++ b/pytorch_transformers/tests/modeling_distilbert_test.py @@ -17,13 +17,9 @@ from __future__ import print_function import unittest -import shutil -import sys -import pytest from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification) -from pytorch_transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py 
b/pytorch_transformers/tests/modeling_transfo_xl_test.py index a060432cc49385..f482c47202245c 100644 --- a/pytorch_transformers/tests/modeling_transfo_xl_test.py +++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py @@ -16,9 +16,7 @@ from __future__ import division from __future__ import print_function -import os import unittest -import json import random import shutil import pytest From 121f88cae3f5fc1dc4fe790dc1c258eb08b4b254 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 5 Sep 2019 02:17:50 +0200 Subject: [PATCH 097/119] update conversion scripts --- pytorch_transformers/__init__.py | 11 +++++++++++ .../convert_gpt2_checkpoint_to_pytorch.py | 2 +- .../convert_openai_checkpoint_to_pytorch.py | 2 +- .../convert_pytorch_checkpoint_to_tf.py | 2 +- .../convert_roberta_checkpoint_to_pytorch.py | 4 ++-- .../convert_tf_checkpoint_to_pytorch.py | 2 +- .../convert_transfo_xl_checkpoint_to_pytorch.py | 2 +- .../convert_xlm_checkpoint_to_pytorch.py | 2 +- .../convert_xlnet_checkpoint_to_pytorch.py | 2 +- 9 files changed, 20 insertions(+), 9 deletions(-) diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index 04a73c3abca7bd..b851c99c9de86c 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -1,4 +1,15 @@ __version__ = "1.2.0" +# Work around to update TensorFlow's absl.logging threshold which alters the +# default Python logging output behavior when present. +# see: https://github.com/abseil/abseil-py/issues/99 +# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 +try: + import absl.logging + absl.logging.set_verbosity('info') + absl.logging.set_stderrthreshold('info') + absl.logging._warn_preinit_stderr = False +except: + pass # Tokenizer from .tokenization_utils import (PreTrainedTokenizer) diff --git a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py index e9bfa0302ad717..eb5b3009b4ce63 100755 --- a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py @@ -21,7 +21,7 @@ import torch -from pytorch_transformers.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME, +from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2) diff --git a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py index 3009f8a99e18e0..5eecdd9648c2ff 100755 --- a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py @@ -21,7 +21,7 @@ import torch -from pytorch_transformers.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME, +from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt) diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py index 4857ea8d808926..15fd6bf5acfc0c 100644 --- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py +++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py @@ -20,7 +20,7 @@ import torch import numpy as np import tensorflow as tf -from pytorch_transformers.modeling import BertModel +from pytorch_transformers import BertModel def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str): diff --git a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py 
b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py index 743013e4c49707..9f74254daa8854 100644 --- a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py @@ -23,12 +23,12 @@ from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.modules import TransformerSentenceEncoderLayer -from pytorch_transformers.modeling_bert import (BertConfig, BertEncoder, +from pytorch_transformers import (BertConfig, BertEncoder, BertIntermediate, BertLayer, BertModel, BertOutput, BertSelfAttention, BertSelfOutput) -from pytorch_transformers.modeling_roberta import (RobertaEmbeddings, +from pytorch_transformers import (RobertaEmbeddings, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaModel) diff --git a/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py index 220204f36ea306..d382d3588e2c47 100755 --- a/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py @@ -21,7 +21,7 @@ import argparse import torch -from pytorch_transformers.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert +from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert import logging logging.basicConfig(level=logging.INFO) diff --git a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py index 7e79d58d7dafb1..b310b73453c9c6 100755 --- a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py @@ -26,7 +26,7 @@ import pytorch_transformers.tokenization_transfo_xl as data_utils from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME -from pytorch_transformers.modeling_transfo_xl import (TransfoXLConfig, TransfoXLLMHeadModel, +from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl) from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES) diff --git a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py index bf4b99b6eacc84..d6a3cd89e7efc2 100755 --- a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py @@ -23,7 +23,7 @@ import torch import numpy -from pytorch_transformers.modeling_utils import CONFIG_NAME, WEIGHTS_NAME +from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES import logging diff --git a/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py index 038c70696043f1..a36fa514b59b7f 100755 --- a/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py @@ -22,7 +22,7 @@ import argparse import torch -from pytorch_transformers.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME, +from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, XLNetConfig, XLNetLMHeadModel, XLNetForQuestionAnswering, XLNetForSequenceClassification, From 85df4f7cca89f5ff8db493f045bede7c3c8a7d11 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 5 Sep 2019 02:34:09 +0200 Subject: [PATCH 098/119] also gathering file names in file_utils --- pytorch_transformers/__init__.py | 8 
+++++--- pytorch_transformers/configuration_utils.py | 4 +--- pytorch_transformers/file_utils.py | 4 ++++ pytorch_transformers/modeling_utils.py | 5 +---- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index b851c99c9de86c..24ff52e5d4d1ec 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -24,7 +24,7 @@ from .tokenization_distilbert import DistilBertTokenizer # Configurations -from .configuration_utils import CONFIG_NAME, PretrainedConfig +from .configuration_utils import PretrainedConfig from .configuration_auto import AutoConfig from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -36,7 +36,7 @@ from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP # Modeling -from .modeling_utils import (WEIGHTS_NAME, TF_WEIGHTS_NAME, PreTrainedModel, prune_layer, Conv1D) +from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelWithLMHead) @@ -70,4 +70,6 @@ WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) # Files and general utilities -from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path, add_start_docstrings, add_end_docstrings) +from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, + cached_path, add_start_docstrings, add_end_docstrings, + WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME) diff --git a/pytorch_transformers/configuration_utils.py b/pytorch_transformers/configuration_utils.py index 550b47fab84cde..7efc735d413212 100644 --- a/pytorch_transformers/configuration_utils.py +++ b/pytorch_transformers/configuration_utils.py @@ -24,12 +24,10 @@ import os from io import open -from .file_utils import cached_path +from .file_utils import cached_path, CONFIG_NAME logger = logging.getLogger(__name__) -CONFIG_NAME = "config.json" - class PretrainedConfig(object): r""" Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. 
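Editor's note: a minimal sketch (not part of the patch) of where the file-name constants resolve once this commit is applied. It assumes the `file_utils.py` definitions shown in the next hunk and the re-exports added to `__init__.py` above; the helper function is purely illustrative.

```python
# Minimal sketch (not part of the patch): after this commit the file-name
# constants are defined in pytorch_transformers.file_utils and re-exported at
# the package root, so both import paths below resolve to the same strings.
import os

from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME    # re-exported in __init__.py
from pytorch_transformers.file_utils import TF_WEIGHTS_NAME   # canonical location after the move

def checkpoint_files(output_dir):
    # Illustrative helper (hypothetical): the file names a saved checkpoint
    # directory would contain.
    return os.path.join(output_dir, WEIGHTS_NAME), os.path.join(output_dir, CONFIG_NAME)

print(WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME)  # pytorch_model.bin config.json model.ckpt
```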
diff --git a/pytorch_transformers/file_utils.py b/pytorch_transformers/file_utils.py index 37ebc57fb3cd7c..3fe7fa891def43 100644 --- a/pytorch_transformers/file_utils.py +++ b/pytorch_transformers/file_utils.py @@ -48,6 +48,10 @@ PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility +WEIGHTS_NAME = "pytorch_model.bin" +TF_WEIGHTS_NAME = 'model.ckpt' +CONFIG_NAME = "config.json" + logger = logging.getLogger(__name__) # pylint: disable=invalid-name if not six.PY2: diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 48420f6d07ec85..2fb4671674a2c7 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -31,13 +31,10 @@ from torch.nn import functional as F from .configuration_utils import PretrainedConfig -from .file_utils import cached_path +from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME logger = logging.getLogger(__name__) -WEIGHTS_NAME = "pytorch_model.bin" -TF_WEIGHTS_NAME = 'model.ckpt' - try: from torch.nn import Identity From d737947725afd06fc3fd0e57939266eba2709bfd Mon Sep 17 00:00:00 2001 From: maru0kun <53220859+maru0kun@users.noreply.github.com> Date: Thu, 5 Sep 2019 19:24:57 +0900 Subject: [PATCH 099/119] Fix typo --- pytorch_transformers/tokenization_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 53b8d245b86f87..9bb69eb703d6b9 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -174,7 +174,7 @@ def eos_token_id(self): return self.convert_tokens_to_ids(self._eos_token) @property - def unk_token_is(self): + def unk_token_id(self): """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ if self._unk_token is None: logger.error("Using unk_token, but it is not set yet.") From f7ee2e5d20d4de18400a353443a172782981e956 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Tue, 3 Sep 2019 10:29:41 -0400 Subject: [PATCH 100/119] [README] link to Write With Transformer --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index c473111f2b11b4..e40852cd3e7158 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ These implementations have been tested on several datasets (see the example scri | Section | Description | |-|-| | [Installation](#installation) | How to install the package | +| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities | | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 | | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation | | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers | @@ -68,6 +69,14 @@ It contains an example of a conversion script from a Pytorch trained Transformer At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML, or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting! 
+## Online demo + +**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo’s text generation capabilities. +You can use it to experiment with completions generated by `GPT2Model`, `TransfoXLModel`, and `XLNetModel`. + +> “🦄 Write with transformer is to writing what calculators are to calculus.” + +![write_with_transformer](https://transformer.huggingface.co/assets/thumbnail-large.png) ## Quick tour From f9453d15e5342bb5a60498916635f1a5e5c6cb14 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Thu, 5 Sep 2019 12:35:22 -0400 Subject: [PATCH 101/119] Fix broken link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e40852cd3e7158..09523401ef6132 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ You can use it to experiment with completions generated by `GPT2Model`, `Transfo > “🦄 Write with transformer is to writing what calculators are to calculus.” -![write_with_transformer](https://transformer.huggingface.co/assets/thumbnail-large.png) +![write_with_transformer](https://transformer.huggingface.co/front/assets/thumbnail-large.png) ## Quick tour From dddd6b99272652f58b3d9f3a29d5c17cc4e119c9 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Thu, 5 Sep 2019 18:26:14 +0000 Subject: [PATCH 102/119] Update DistilBERT training code --- examples/distillation/README.md | 6 ++++++ examples/distillation/distiller.py | 9 ++++++--- examples/distillation/requirements.txt | 3 +++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/examples/distillation/README.md b/examples/distillation/README.md index bb919385f171d9..73e0cc0655903a 100644 --- a/examples/distillation/README.md +++ b/examples/distillation/README.md @@ -9,6 +9,12 @@ DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and l For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5 ). +## Setup + +This part of the library has only be tested with Python3.6+. There are few specific dependencies to install before launching a distillation, you can install them with the command `pip install -r requirements.txt`. + +**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend to [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/pytorch-transformers/issues/1179) for more details. 
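Editor's note: one of the breaking changes the note above refers to is the switch to boolean mask tensors, which is exactly what the `distiller.py` hunk just below works around. A minimal sketch (not part of the patch), assuming PyTorch >= 1.2.0:

```python
# Minimal sketch (not part of the patch): under PyTorch 1.2.0, masks used for
# indexing should be torch.bool rather than torch.uint8, and negation is
# written with ~mask instead of 1 - mask -- the same change applied to
# prepare_batch() in distiller.py below.
import torch

mlm_labels = torch.tensor([5, 7, 9])
pred_mask = torch.tensor([True, False, True])  # previously dtype=torch.uint8

mlm_labels[~pred_mask] = -1                    # previously mlm_labels[1 - pred_mask] = -1
print(mlm_labels)                              # tensor([ 5, -1,  9])
```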
+ ## How to use DistilBERT PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT): diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index 38769c4b0ec262..ed710a2bee6c7d 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -17,6 +17,7 @@ """ import os import math +import psutil from tensorboardX import SummaryWriter from tqdm import trange, tqdm import numpy as np @@ -192,7 +193,7 @@ def prepare_batch(self, x_prob = self.token_probs[token_ids.flatten()] n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item()) tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False) - pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.uint8, device=token_ids.device) + pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.bool, device=token_ids.device) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility pred_mask[tgt_ids] = 1 pred_mask = pred_mask.view(bs, max_seq_len) @@ -216,7 +217,7 @@ def prepare_batch(self, _token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long() token_ids = token_ids.masked_scatter(pred_mask, _token_ids) - mlm_labels[1-pred_mask] = -1 + mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility return token_ids, attn_mask, mlm_labels @@ -379,9 +380,9 @@ def optimize(self, torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm) - self.scheduler.step() self.optimizer.step() self.optimizer.zero_grad() + self.scheduler.step() def iter(self): """ @@ -418,6 +419,8 @@ def log_tensorboard(self): if self.alpha_mse > 0.: self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter) self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter) + + self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter) def end_epoch(self): """ diff --git a/examples/distillation/requirements.txt b/examples/distillation/requirements.txt index efb369dc4381c5..18146239eb481d 100644 --- a/examples/distillation/requirements.txt +++ b/examples/distillation/requirements.txt @@ -1 +1,4 @@ gitpython==3.0.2 +tensorboard>=1.14.0 +tensorboardX==1.8 +psutil==5.6.3 From 04b50cabf601b1606d0ffd15c47241f9056520c3 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Thu, 5 Sep 2019 18:49:28 +0000 Subject: [PATCH 103/119] gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index bbc738b931d34f..d285d0ded932b4 100644 --- a/.gitignore +++ b/.gitignore @@ -130,4 +130,5 @@ runs examples/runs # data -data \ No newline at end of file +data +serialization_dir \ No newline at end of file From 5c6cac102b3347960684356d253bb97b4ef2da75 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 5 Sep 2019 21:31:29 +0200 Subject: [PATCH 104/119] adding test for common properties and cleaning up a bit base class --- .../tests/tokenization_tests_commons.py | 16 ++++++++ pytorch_transformers/tokenization_utils.py | 38 +++++++------------ 2 files changed, 30 insertions(+), 24 deletions(-) diff 
--git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py index 65f45c496c26aa..3da0494ac44fbc 100644 --- a/pytorch_transformers/tests/tokenization_tests_commons.py +++ b/pytorch_transformers/tests/tokenization_tests_commons.py @@ -55,6 +55,22 @@ def get_tokenizer(self, **kwargs): def get_input_output_texts(self): raise NotImplementedError + def test_tokenizers_common_properties(self): + tokenizer = self.get_tokenizer() + attributes_list = ["bos_token", "eos_token", "unk_token", "sep_token", + "pad_token", "cls_token", "mask_token"] + for attr in attributes_list: + self.assertTrue(hasattr(tokenizer, attr)) + self.assertTrue(hasattr(tokenizer, attr + "_id")) + + self.assertTrue(hasattr(tokenizer, "additional_special_tokens")) + self.assertTrue(hasattr(tokenizer, 'additional_special_tokens_ids')) + + attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", + "added_tokens_decoder"] + for attr in attributes_list: + self.assertTrue(hasattr(tokenizer, attr)) + def test_save_and_load_tokenizer(self): # safety check on max_len default value so we are sure the test works tokenizer = self.get_tokenizer() diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 9bb69eb703d6b9..1e2cd59648d764 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -162,58 +162,42 @@ def additional_special_tokens(self, value): @property def bos_token_id(self): """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ - if self._bos_token is None: - logger.error("Using bos_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._bos_token) + return self.convert_tokens_to_ids(self.bos_token) @property def eos_token_id(self): """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ - if self._eos_token is None: - logger.error("Using eos_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._eos_token) + return self.convert_tokens_to_ids(self.eos_token) @property def unk_token_id(self): """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ - if self._unk_token is None: - logger.error("Using unk_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._unk_token) + return self.convert_tokens_to_ids(self.unk_token) @property def sep_token_id(self): """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ - if self._sep_token is None: - logger.error("Using sep_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._sep_token) + return self.convert_tokens_to_ids(self.sep_token) @property def pad_token_id(self): """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ - if self._pad_token is None: - logger.error("Using pad_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._pad_token) + return self.convert_tokens_to_ids(self.pad_token) @property def cls_token_id(self): """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. 
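Editor's note: a short sketch (not part of the patch) of what the simplified `*_token_id` properties above, together with the `None` handling added to `convert_tokens_to_ids` further down, amount to in practice. It assumes the `*_token` getters (not shown in this hunk) still log the error and return `None` when the token is unset; GPT-2 is used only as an example of a tokenizer that defines no padding token.

```python
# Minimal sketch (not part of the patch): querying the id of an unset special
# token now logs an error (via the *_token getter) and returns None instead of
# raising inside the *_token_id property.
from pytorch_transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
print(tokenizer.unk_token_id)  # id of '<|endoftext|>', GPT-2's unknown token
print(tokenizer.pad_token_id)  # no pad token is set -> error is logged, None is printed
```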
""" - if self._cls_token is None: - logger.error("Using cls_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._cls_token) + return self.convert_tokens_to_ids(self.cls_token) @property def mask_token_id(self): """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ - if self._mask_token is None: - logger.error("Using mask_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._mask_token) + return self.convert_tokens_to_ids(self.mask_token) @property def additional_special_tokens_ids(self): """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ - if self._additional_special_tokens is None: - logger.error("Using additional_special_tokens, but it is not set yet.") - return self.convert_tokens_to_ids(self._additional_special_tokens) + return self.convert_tokens_to_ids(self.additional_special_tokens) def __init__(self, max_len=None, **kwargs): self._bos_token = None @@ -653,6 +637,9 @@ def convert_tokens_to_ids(self, tokens): """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id (resp. a sequence of ids), using the vocabulary. """ + if tokens is None: + return None + if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)): return self._convert_token_to_id_with_added_voc(tokens) @@ -666,6 +653,9 @@ def convert_tokens_to_ids(self, tokens): return ids def _convert_token_to_id_with_added_voc(self, token): + if token is None: + return None + if token in self.added_tokens_encoder: return self.added_tokens_encoder[token] return self._convert_token_to_id(token) From 593c0704351f35208e44dae1d85be8238209eb2a Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 6 Sep 2019 12:00:12 -0400 Subject: [PATCH 105/119] Better examples --- docs/requirements.txt | 1 + docs/source/conf.py | 3 +- docs/source/examples.rst | 686 ---------------------------------- examples/README.md | 338 +++++++++++++++++ examples/run_lm_finetuning.py | 2 +- 5 files changed, 342 insertions(+), 688 deletions(-) delete mode 100644 docs/source/examples.rst create mode 100644 examples/README.md diff --git a/docs/requirements.txt b/docs/requirements.txt index 112beb3f72bc6e..0c2a31c09acf3e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -26,3 +26,4 @@ sphinxcontrib-jsmath==1.0.1 sphinxcontrib-qthelp==1.0.2 sphinxcontrib-serializinghtml==1.1.3 urllib3==1.25.3 +sphinx-markdown-tables==0.0.9 \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index cdca1d82d0c39c..c847dee806640c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,7 +43,8 @@ 'sphinx.ext.coverage', 'sphinx.ext.napoleon', 'recommonmark', - 'sphinx.ext.viewcode' + 'sphinx.ext.viewcode', + 'sphinx_markdown_tables' ] # Add any paths that contain templates here, relative to this directory. diff --git a/docs/source/examples.rst b/docs/source/examples.rst deleted file mode 100644 index d9784514382c83..00000000000000 --- a/docs/source/examples.rst +++ /dev/null @@ -1,686 +0,0 @@ -examples.rst - -Examples -================================================ - -.. 
list-table:: - :header-rows: 1 - - * - Sub-section - - Description - * - `Training large models: introduction, tools and examples <#introduction>`_ - - How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models - * - `Fine-tuning with BERT: running the examples <#fine-tuning-bert-examples>`_ - - Running the examples in `examples `_\ : ``extract_classif.py``\ , ``run_bert_classifier.py``\ , ``run_bert_squad.py`` and ``run_lm_finetuning.py`` - * - `Fine-tuning with OpenAI GPT, Transformer-XL, GPT-2 as well as BERT and RoBERTa <#fine-tuning>`_ - - Running the examples in `examples `_\ : ``run_openai_gpt.py``\ , ``run_transfo_xl.py``, ``run_gpt2.py`` and ``run_lm_finetuning.py`` - * - `Fine-tuning BERT-large on GPUs <#fine-tuning-bert-large>`_ - - How to fine tune ``BERT large`` - - -.. _introduction: - -Training large models: introduction, tools and examples -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -BERT-base and BERT-large are respectively 110M and 340M parameters models and it can be difficult to fine-tune them on a single GPU with the recommended batch size for good performance (in most case a batch size of 32). - -To help with fine-tuning these models, we have included several techniques that you can activate in the fine-tuning scripts `run_bert_classifier.py `_ and `run_bert_squad.py `_\ : gradient-accumulation, multi-gpu training, distributed training and 16-bits training . For more details on how to use these techniques you can read `the tips on training large batches in PyTorch `_ that I published earlier this year. - -Here is how to use these techniques in our scripts: - - -* **Gradient Accumulation**\ : Gradient accumulation can be used by supplying a integer greater than 1 to the ``--gradient_accumulation_steps`` argument. The batch at each step will be divided by this integer and gradient will be accumulated over ``gradient_accumulation_steps`` steps. -* **Multi-GPU**\ : Multi-GPU is automatically activated when several GPUs are detected and the batches are splitted over the GPUs. -* **Distributed training**\ : Distributed training can be activated by supplying an integer greater or equal to 0 to the ``--local_rank`` argument (see below). -* **16-bits training**\ : 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found `here `__ and a full documentation is `here `__. In our scripts, this option can be activated by setting the ``--fp16`` flag and you can play with loss scaling using the ``--loss_scale`` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero in which case the scale is dynamically adjusted or a positive power of two in which case the scaling is static. - -To use 16-bits training and distributed training, you need to install NVIDIA's apex extension `as detailed here `__. You will find more information regarding the internals of ``apex`` and how to use ``apex`` in `the doc and the associated repository `_. The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in `the relevant PR of the present repository `_. 
- -Note: To use *Distributed Training*\ , you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see `the above mentioned blog post `_\ ) for more details): - -.. code-block:: bash - - python -m torch.distributed.launch \ - --nproc_per_node=4 \ - --nnodes=2 \ - --node_rank=$THIS_MACHINE_INDEX \ - --master_addr="192.168.1.1" \ - --master_port=1234 run_bert_classifier.py \ - (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script) - -Where ``$THIS_MACHINE_INDEX`` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP address ``192.168.1.1`` and an open port ``1234``. - -.. _fine-tuning-bert-examples: - -Fine-tuning with BERT: running the examples -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We showcase several fine-tuning examples based on (and extended from) `the original implementation `_\ : - - -* a *sequence-level classifier* on nine different GLUE tasks, -* a *token-level classifier* on the question answering dataset SQuAD, and -* a *sequence-level multiple-choice classifier* on the SWAG classification corpus. -* a *BERT language model* on another target corpus - -GLUE results on dev set -~~~~~~~~~~~~~~~~~~~~~~~ - -We get the following results on the dev set of GLUE benchmark with an uncased BERT base -model (`bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train batch size of 24. Some of -these tasks have a small dataset and training can lead to high variance in the results between different runs. -We report the median on 5 runs (with different seeds) for each of the metrics. - -.. list-table:: - :header-rows: 1 - - * - Task - - Metric - - Result - * - CoLA - - Matthew's corr. - - 55.75 - * - SST-2 - - accuracy - - 92.09 - * - MRPC - - F1/accuracy - - 90.48/86.27 - * - STS-B - - Pearson/Spearman corr. - - 89.03/88.64 - * - QQP - - accuracy/F1 - - 90.92/87.72 - * - MNLI - - matched acc./mismatched acc. - - 83.74/84.06 - * - QNLI - - accuracy - - 91.07 - * - RTE - - accuracy - - 68.59 - * - WNLI - - accuracy - - 43.66 - - -Some of these results are significantly different from the ones reported on the test set -of GLUE benchmark on the website. For QQP and WNLI, please refer to `FAQ #12 `_ on the webite. - -Before running anyone of these GLUE tasks you should download the -`GLUE data `_ by running -`this script `_ -and unpack it to some directory ``$GLUE_DIR``. - -.. code-block:: shell - - export GLUE_DIR=/path/to/glue - export TASK_NAME=MRPC - - python run_bert_classifier.py \ - --task_name $TASK_NAME \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir $GLUE_DIR/$TASK_NAME \ - --bert_model bert-base-uncased \ - --max_seq_length 128 \ - --train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/$TASK_NAME/ - -where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI. - -The dev set results will be present within the text file 'eval_results.txt' in the specified output_dir. In case of MNLI, since there are two separate dev sets, matched and mismatched, there will be a separate output folder called '/tmp/MNLI-MM/' in addition to '/tmp/MNLI/'. - -The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI, CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. 
With that being said, there shouldn't be any issues in running half-precision training with the remaining GLUE tasks as well, since the data processor for each task inherits from the base class DataProcessor. - -MRPC -~~~~ - -This example code fine-tunes BERT on the Microsoft Research Paraphrase -Corpus (MRPC) corpus and runs in less than 10 minutes on a single K-80 and in 27 seconds (!) on single tesla V100 16GB with apex installed. - -Before running this example you should download the -`GLUE data `_ by running -`this script `_ -and unpack it to some directory ``$GLUE_DIR``. - -.. code-block:: shell - - export GLUE_DIR=/path/to/glue - - python run_bert_classifier.py \ - --task_name MRPC \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir $GLUE_DIR/MRPC/ \ - --bert_model bert-base-uncased \ - --max_seq_length 128 \ - --train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/mrpc_output/ - -Our test ran on a few seeds with `the original implementation hyper-parameters `__ gave evaluation results between 84% and 88%. - -**Fast run with apex and 16 bit precision: fine-tuning on MRPC in 27 seconds!** -First install apex as indicated `here `__. -Then run - -.. code-block:: shell - - export GLUE_DIR=/path/to/glue - - python run_bert_classifier.py \ - --task_name MRPC \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir $GLUE_DIR/MRPC/ \ - --bert_model bert-base-uncased \ - --max_seq_length 128 \ - --train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/mrpc_output/ \ - --fp16 - -**Distributed training** -Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 92 on MRPC: - -.. code-block:: bash - - python -m torch.distributed.launch \ - --nproc_per_node 8 run_bert_classifier.py \ - --bert_model bert-large-uncased-whole-word-masking \ - --task_name MRPC \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir $GLUE_DIR/MRPC/ \ - --max_seq_length 128 \ - --train_batch_size 8 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/mrpc_output/ - -Training with these hyper-parameters gave us the following results: - -.. code-block:: bash - - acc = 0.8823529411764706 - acc_and_f1 = 0.901702786377709 - eval_loss = 0.3418912578906332 - f1 = 0.9210526315789473 - global_step = 174 - loss = 0.07231863956341798 - -Here is an example on MNLI: - -.. code-block:: bash - - python -m torch.distributed.launch \ - --nproc_per_node 8 run_bert_classifier.py \ - --bert_model bert-large-uncased-whole-word-masking \ - --task_name mnli \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir /datadrive/bert_data/glue_data//MNLI/ \ - --max_seq_length 128 \ - --train_batch_size 8 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir ../models/wwm-uncased-finetuned-mnli/ \ - --overwrite_output_dir - -.. code-block:: bash - - ***** Eval results ***** - acc = 0.8679706601466992 - eval_loss = 0.4911287787382479 - global_step = 18408 - loss = 0.04755385363816904 - - ***** Eval results ***** - acc = 0.8747965825874695 - eval_loss = 0.45516540421714036 - global_step = 18408 - loss = 0.04755385363816904 - -This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model - -SQuAD -~~~~~ - -This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB. 
- -The data for SQuAD can be downloaded with the following links and should be saved in a ``$SQUAD_DIR`` directory. - - -* `train-v1.1.json `_ -* `dev-v1.1.json `_ -* `evaluate-v1.1.py `_ - -.. code-block:: shell - - export SQUAD_DIR=/path/to/SQUAD - - python run_bert_squad.py \ - --bert_model bert-base-uncased \ - --do_train \ - --do_predict \ - --do_lower_case \ - --train_file $SQUAD_DIR/train-v1.1.json \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --train_batch_size 12 \ - --learning_rate 3e-5 \ - --num_train_epochs 2.0 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/debug_squad/ - -Training with the previous hyper-parameters gave us the following results: - -.. code-block:: bash - - python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json /tmp/debug_squad/predictions.json - {"f1": 88.52381567990474, "exact_match": 81.22043519394512} - -**distributed training** - -Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD: - -.. code-block:: bash - - python -m torch.distributed.launch --nproc_per_node=8 \ - run_bert_squad.py \ - --bert_model bert-large-uncased-whole-word-masking \ - --do_train \ - --do_predict \ - --do_lower_case \ - --train_file $SQUAD_DIR/train-v1.1.json \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir ../models/wwm_uncased_finetuned_squad/ \ - --train_batch_size 24 \ - --gradient_accumulation_steps 12 - -Training with these hyper-parameters gave us the following results: - -.. code-block:: bash - - python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json - {"exact_match": 86.91579943235573, "f1": 93.1532499015869} - -This is the model provided as ``bert-large-uncased-whole-word-masking-finetuned-squad``. - -And here is the model provided as ``bert-large-cased-whole-word-masking-finetuned-squad``\ : - -.. code-block:: bash - - python -m torch.distributed.launch --nproc_per_node=8 run_bert_squad.py \ - --bert_model bert-large-cased-whole-word-masking \ - --do_train \ - --do_predict \ - --do_lower_case \ - --train_file $SQUAD_DIR/train-v1.1.json \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir ../models/wwm_cased_finetuned_squad/ \ - --train_batch_size 24 \ - --gradient_accumulation_steps 12 - -Training with these hyper-parameters gave us the following results: - -.. code-block:: bash - - python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json - {"exact_match": 84.18164616840113, "f1": 91.58645594850135} - -SWAG -~~~~ - -The data for SWAG can be downloaded by cloning the following `repository `_ - -.. code-block:: shell - - export SWAG_DIR=/path/to/SWAG - - python run_bert_swag.py \ - --bert_model bert-base-uncased \ - --do_train \ - --do_lower_case \ - --do_eval \ - --data_dir $SWAG_DIR/data \ - --train_batch_size 16 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --max_seq_length 80 \ - --output_dir /tmp/swag_output/ \ - --gradient_accumulation_steps 4 - -Training with the previous hyper-parameters on a single GPU gave us the following results: - -.. 
code-block:: - - eval_accuracy = 0.8062081375587323 - eval_loss = 0.5966546792367169 - global_step = 13788 - loss = 0.06423990014260186 - -LM Fine-tuning -~~~~~~~~~~~~~~ - -The data should be a text file in the same format as `sample_text.txt <./pytorch_transformers/tests/fixtures/sample_text.txt/sample_text.txt>`_ (one sentence per line, docs separated by empty line). -You can download an `exemplary training corpus `_ generated from wikipedia articles and split into ~500k sentences with spaCy. -Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with ``train_batch_size=200`` and ``max_seq_length=128``\ : - -Thank to the work of @Rocketknight1 and @tholor there are now **several scripts** that can be used to fine-tune BERT using the pretraining objective (combination of masked-language modeling and next sentence prediction loss). These scripts are detailed in the `README `_ of the `examples/lm_finetuning/ `_ folder. - -.. _fine-tuning: - -OpenAI GPT, Transformer-XL and GPT-2: running the examples -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We provide three examples of scripts for OpenAI GPT, Transformer-XL, OpenAI GPT-2, BERT and RoBERTa based on (and extended from) the respective original implementations: - - -* fine-tuning OpenAI GPT on the ROCStories dataset -* evaluating Transformer-XL on Wikitext 103 -* unconditional and conditional generation from a pre-trained OpenAI GPT-2 model -* fine-tuning GPT/GPT-2 on a causal language modeling task and BERT/RoBERTa on a masked language modeling task - -Fine-tuning OpenAI GPT on the RocStories dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example code fine-tunes OpenAI GPT on the RocStories dataset. - -Before running this example you should download the -`RocStories dataset `_ and unpack it to some directory ``$ROC_STORIES_DIR``. - -.. code-block:: shell - - export ROC_STORIES_DIR=/path/to/RocStories - - python run_openai_gpt.py \ - --model_name openai-gpt \ - --do_train \ - --do_eval \ - --train_dataset $ROC_STORIES_DIR/cloze_test_val__spring2016\ -\ cloze_test_ALL_val.csv \ - --eval_dataset $ROC_STORIES_DIR/cloze_test_test__spring2016\ -\ cloze_test_ALL_test.csv \ - --output_dir ../log \ - --train_batch_size 16 \ - -This command runs in about 10 min on a single K-80 an gives an evaluation accuracy of about 87.7% (the authors report a median accuracy with the TensorFlow code of 85.8% and the OpenAI GPT paper reports a best single run accuracy of 86.5%). - -Evaluating the pre-trained Transformer-XL on the WikiText 103 dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example code evaluate the pre-trained Transformer-XL on the WikiText 103 dataset. -This command will download a pre-processed version of the WikiText 103 dataset in which the vocabulary has been computed. - -.. code-block:: shell - - python run_transfo_xl.py --work_dir ../log - -This command runs in about 1 min on a V100 and gives an evaluation perplexity of 18.22 on WikiText-103 (the authors report a perplexity of about 18.3 on this dataset with the TensorFlow code). - -Unconditional and conditional generation from OpenAI's GPT-2 model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example code is identical to the original unconditional and conditional generation codes. - -Conditional generation: - -.. code-block:: shell - - python run_gpt2.py - -Unconditional generation: - -.. 
code-block:: shell - - python run_gpt2.py --unconditional - -The same option as in the original scripts are provided, please refer to the code of the example and the original repository of OpenAI. - - -Causal LM fine-tuning on GPT/GPT-2, Masked LM fine-tuning on BERT/RoBERTa -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Before running the following examples you should download the `WikiText-2 dataset `__ and unpack it to some directory `$WIKITEXT_2_DATASET` -The following results were obtained using the `raw` WikiText-2 (no tokens were replaced before the tokenization). - -This example fine-tunes GPT-2 on the WikiText-2 dataset. The loss function is a causal language modeling loss (perplexity). - -.. code-block:: bash - - - export WIKITEXT_2_DATASET=/path/to/wikitext_dataset - - python run_lm_finetuning.py - --output_dir=output - --model_type=gpt2 - --model_name_or_path=gpt2 - --do_train - --train_data_file=$WIKITEXT_2_DATASET/wiki.train.raw - --do_eval - --eval_data_file=$WIKITEXT_2_DATASET/wiki.test.raw - -This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. -It reaches a score of about 20 perplexity once fine-tuned on the dataset. - -This example fine-tunes RoBERTa on the WikiText-2 dataset. The loss function is a masked language modeling loss (masked perplexity). -The `--mlm` flag is necessary to fine-tune BERT/RoBERTa on masked language modeling. - -.. code-block:: bash - - - export WIKITEXT_2_DATASET=/path/to/wikitext_dataset - - python run_lm_finetuning.py - --output_dir=output - --model_type=roberta - --model_name_or_path=roberta-base - --do_train - --train_data_file=$WIKITEXT_2_DATASET/wiki.train.raw - --do_eval - --eval_data_file=$WIKITEXT_2_DATASET/wiki.test.raw - --mlm - -.. _fine-tuning-BERT-large: - -Fine-tuning BERT-large on GPUs ------------------------------- - -The options we list above allow to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation. - -For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 k-80 (these are pretty old now) in 18 hours. Our results are similar to the TensorFlow implementation results (actually slightly higher): - -.. code-block:: bash - - {"exact_match": 84.56953642384106, "f1": 91.04028647786927} - -To get these results we used a combination of: - - -* multi-GPU training (automatically activated on a multi-GPU server), -* 2 steps of gradient accumulation and -* perform the optimization step on CPU to store Adam's averages in RAM. - -Here is the full list of hyper-parameters for this run: - -.. code-block:: bash - - export SQUAD_DIR=/path/to/SQUAD - - python ./run_bert_squad.py \ - --bert_model bert-large-uncased \ - --do_train \ - --do_predict \ - --do_lower_case \ - --train_file $SQUAD_DIR/train-v1.1.json \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/debug_squad/ \ - --train_batch_size 24 \ - --gradient_accumulation_steps 2 - -If you have a recent GPU (starting from NVIDIA Volta series), you should try **16-bit fine-tuning** (FP16). - -Here is an example of hyper-parameters for a FP16 run we tried: - -.. 
code-block:: bash - - export SQUAD_DIR=/path/to/SQUAD - - python ./run_bert_squad.py \ - --bert_model bert-large-uncased \ - --do_train \ - --do_predict \ - --do_lower_case \ - --train_file $SQUAD_DIR/train-v1.1.json \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/debug_squad/ \ - --train_batch_size 24 \ - --fp16 \ - --loss_scale 128 - -The results were similar to the above FP32 results (actually slightly higher): - -.. code-block:: bash - - {"exact_match": 84.65468306527909, "f1": 91.238669287002} - -Here is an example with the recent ``bert-large-uncased-whole-word-masking``\ : - -.. code-block:: bash - - python -m torch.distributed.launch --nproc_per_node=8 \ - run_bert_squad.py \ - --bert_model bert-large-uncased-whole-word-masking \ - --do_train \ - --do_predict \ - --do_lower_case \ - --train_file $SQUAD_DIR/train-v1.1.json \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/debug_squad/ \ - --train_batch_size 24 \ - --gradient_accumulation_steps 2 - -Fine-tuning XLNet ------------------ - -STS-B -~~~~~ - -This example code fine-tunes XLNet on the STS-B corpus. - -Before running this example you should download the -`GLUE data `_ by running -`this script `_ -and unpack it to some directory ``$GLUE_DIR``. - -.. code-block:: shell - - export GLUE_DIR=/path/to/glue - - python run_xlnet_classifier.py \ - --task_name STS-B \ - --do_train \ - --do_eval \ - --data_dir $GLUE_DIR/STS-B/ \ - --max_seq_length 128 \ - --train_batch_size 8 \ - --gradient_accumulation_steps 1 \ - --learning_rate 5e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/mrpc_output/ - -Our test ran on a few seeds with `the original implementation hyper-parameters `__ gave evaluation results between 84% and 88%. - -**Distributed training** -Here is an example using distributed training on 8 V100 GPUs to reach XXXX: - -.. code-block:: bash - - python -m torch.distributed.launch --nproc_per_node 8 \ - run_xlnet_classifier.py \ - --task_name STS-B \ - --do_train \ - --do_eval \ - --data_dir $GLUE_DIR/STS-B/ \ - --max_seq_length 128 \ - --train_batch_size 8 \ - --gradient_accumulation_steps 1 \ - --learning_rate 5e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/mrpc_output/ - -Training with these hyper-parameters gave us the following results: - -.. code-block:: bash - - acc = 0.8823529411764706 - acc_and_f1 = 0.901702786377709 - eval_loss = 0.3418912578906332 - f1 = 0.9210526315789473 - global_step = 174 - loss = 0.07231863956341798 - -Here is an example on MNLI: - -.. code-block:: bash - - python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py \ - --bert_model bert-large-uncased-whole-word-masking \ - --task_name mnli \ - --do_train \ - --do_eval \ - --data_dir /datadrive/bert_data/glue_data//MNLI/ \ - --max_seq_length 128 \ - --train_batch_size 8 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir ../models/wwm-uncased-finetuned-mnli/ \ - --overwrite_output_dir - -.. code-block:: bash - - ***** Eval results ***** - acc = 0.8679706601466992 - eval_loss = 0.4911287787382479 - global_step = 18408 - loss = 0.04755385363816904 - - ***** Eval results ***** - acc = 0.8747965825874695 - eval_loss = 0.45516540421714036 - global_step = 18408 - loss = 0.04755385363816904 - -This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model. 
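The ``--fp16`` and ``--loss_scale`` options used in the runs above rely on NVIDIA apex. The sketch below shows one common apex mixed-precision pattern (the ``amp`` API) in isolation; the actual example scripts may wire this up differently, and the model, optimizer and batch here are stand-ins rather than the real script objects:

.. code-block:: python

    import torch
    from apex import amp  # requires NVIDIA apex: https://github.com/NVIDIA/apex

    model = torch.nn.Linear(768, 2).cuda()                      # stand-in for a classifier head
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

    # "O1" keeps FP32 master weights and runs most ops in FP16; a fixed loss
    # scale can be passed explicitly (cf. --loss_scale 128) instead of "dynamic".
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", loss_scale=128)

    for _ in range(3):                                          # stand-in training loop
        hidden = torch.randn(8, 768).cuda()
        labels = torch.randint(0, 2, (8,)).cuda()
        loss = torch.nn.functional.cross_entropy(model(hidden), labels)
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()                              # scaled to avoid FP16 underflow
        optimizer.step()
        optimizer.zero_grad()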
diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 00000000000000..46ff9270db7e2e --- /dev/null +++ b/examples/README.md @@ -0,0 +1,338 @@ +# Examples + +In this section a few examples are put together. All of these examples work for several models, making use of the very +similar API between the different models. + +## Language model fine-tuning + +Based on the script `run_lm_finetuning.py`. + +Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT +to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa +are fine-tuned using a masked language modeling (MLM) loss. + +Before running the following example, you should get a file that contains text on which the language model will be +fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/). + +We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains +text that will be used for evaluation. + +### GPT-2/GPT and causal language modeling + +The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before +the tokenization). The loss here is that of causal language modeling. + +```bash +export TRAIN_FILE=/path/to/dataset/wiki.train.raw +export TEST_FILE=/path/to/dataset/wiki.test.raw + +python run_lm_finetuning.py \ + --output_dir=output \ + --model_type=gpt2 \ + --model_name_or_path=gpt2 \ + --do_train \ + --train_data_file=$TRAIN_FILE \ + --do_eval \ + --eval_data_file=$TEST_FILE +``` + +This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches +a score of ~20 perplexity once fine-tuned on the dataset. + +### RoBERTa/BERT and masked language modeling + +The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different +as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their +pre-training: masked language modeling. + +In accordance to the RoBERTa paper, we use dynamic masking rather than static masking. The model may therefore converge +slower, but over-fitting would take more epochs. + +We use the `--mlm` flag so that the script may change its loss function. + +```bash +export TRAIN_FILE=/path/to/dataset/wiki.train.raw +export TEST_FILE=/path/to/dataset/wiki.test.raw + +python run_lm_finetuning.py \ + --output_dir=output \ + --model_type=roberta \ + --model_name_or_path=roberta-base \ + --do_train \ + --train_data_file=$TRAIN_FILE \ + --do_eval \ + --eval_data_file=$TEST_FILE \ + --mlm +``` + +## Language generation + +Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. +A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you +can try out the different models available in the library. + +Example usage: + +```bash +python run_generation.py \ + --model_type=gpt2 \ + --model_name_or_path=gpt2 +``` + +## GLUE + +Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding +Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa. + +GLUE is made up of a total of 9 different tasks. 
We get the following results on the dev set of the benchmark with an +uncased BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train +batch size of 24. Some of these tasks have a small dataset and training can lead to high variance in the results +between different runs. We report the median on 5 runs (with different seeds) for each of the metrics. + +| Task | Metric | Result | +|-------|------------------------------|-------------| +| CoLA | Matthew's corr | 55.75 | +| SST-2 | Accuracy | 92.09 | +| MRPC | F1/Accuracy | 90.48/86.27 | +| STS-B | Person/Spearman corr. | 89.03/88.64 | +| QQP | Accuracy/F1 | 90.92/87.72 | +| MNLI | Matched acc./Mismatched acc. | 83.74/84.06 | +| QNLI | Accuracy | 91.07 | +| RTE | Accuracy | 68.59 | +| WNLI | Accuracy | 43.66 | + +Some of these results are significantly different from the ones reported on the test set +of GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the webite. + +Before running anyone of these GLUE tasks you should download the +[GLUE data](https://gluebenchmark.com/tasks) by running +[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) +and unpack it to some directory `$GLUE_DIR`. + +```bash +export GLUE_DIR=/path/to/glue +export TASK_NAME=MRPC + +python run_bert_classifier.py \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $GLUE_DIR/$TASK_NAME \ + --bert_model bert-base-uncased \ + --max_seq_length 128 \ + --train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --output_dir /tmp/$TASK_NAME/ +``` + +where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI. + +The dev set results will be present within the text file `eval_results.txt` in the specified output_dir. +In case of MNLI, since there are two separate dev sets (matched and mismatched), there will be a separate +output folder called `/tmp/MNLI-MM/` in addition to `/tmp/MNLI/`. + +The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI, +CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being +said, there shouldn’t be any issues in running half-precision training with the remaining GLUE tasks as well, +since the data processor for each task inherits from the base class DataProcessor. + +### MRPC + +#### Fine-tuning example + +The following examples fine-tune BERT on the Microsoft Research Paraphrase Corpus (MRPC) corpus and runs in less +than 10 minutes on a single K-80 and in 27 seconds (!) on single tesla V100 16GB with apex installed. + +Before running anyone of these GLUE tasks you should download the +[GLUE data](https://gluebenchmark.com/tasks) by running +[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) +and unpack it to some directory `$GLUE_DIR`. 
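The MRPC folder produced by the download script should contain `train.tsv` and `dev.tsv`, with one sentence pair and a binary paraphrase label per row. A quick way to sanity-check the unpacked data before launching the command below (assuming the standard GLUE MRPC column layout; adjust if your copy differs):

```python
# Quick sanity check of the unpacked MRPC data before running the command below.
# Assumes the standard GLUE MRPC layout: Quality, #1 ID, #2 ID, #1 String, #2 String.
import csv
import os

glue_dir = os.environ.get("GLUE_DIR", "/path/to/glue")
with open(os.path.join(glue_dir, "MRPC", "train.tsv"), encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t", quotechar=None)
    header = next(reader)
    rows = list(reader)

print("columns:", header)
print("training pairs:", len(rows))          # 3668 pairs in the standard split
label, sent_a, sent_b = rows[0][0], rows[0][3], rows[0][4]
print(label, "|", sent_a[:60], "|", sent_b[:60])
```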
+ +```bash +export GLUE_DIR=/path/to/glue + +python run_bert_classifier.py \ + --task_name MRPC \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $GLUE_DIR/MRPC/ \ + --bert_model bert-base-uncased \ + --max_seq_length 128 \ + --train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --output_dir /tmp/mrpc_output/ +``` + +Our test ran on a few seeds with [the original implementation hyper- +parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) gave evaluation +results between 84% and 88%. + +#### Using Apex and mixed-precision + +Using Apex and 16 bit precision, the fine-tuning on MRPC only takes 27 seconds. First install +[apex](https://github.com/NVIDIA/apex), then run the following example: + +```bash +export GLUE_DIR=/path/to/glue + +python run_bert_classifier.py \ + --task_name MRPC \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $GLUE_DIR/MRPC/ \ + --bert_model bert-base-uncased \ + --max_seq_length 128 \ + --train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --output_dir /tmp/mrpc_output/ \ + --fp16 +``` + +#### Distributed training + +Here is an example using distributed training on 8 V100 GPUs. The model used is the BERT whole-word-masking and it +reaches F1 > 92 on MRPC. + +```bash +export GLUE_DIR=/path/to/glue + +python -m torch.distributed.launch \ + --nproc_per_node 8 run_bert_classifier.py \ + --bert_model bert-large-uncased-whole-word-masking \ + --task_name MRPC \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $GLUE_DIR/MRPC/ \ + --max_seq_length 128 \ + --train_batch_size 8 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --output_dir /tmp/mrpc_output/ +``` + +Training with these hyper-parameters gave us the following results: + +```bash +acc = 0.8823529411764706 +acc_and_f1 = 0.901702786377709 +eval_loss = 0.3418912578906332 +f1 = 0.9210526315789473 +global_step = 174 +loss = 0.07231863956341798 +``` + +### MNLI + +The following example uses the BERT-large, uncased, whole-word-masking model and fine-tunes it on the MNLI task. + +```bash +export GLUE_DIR=/path/to/glue + +python -m torch.distributed.launch \ + --nproc_per_node 8 run_bert_classifier.py \ + --bert_model bert-large-uncased-whole-word-masking \ + --task_name mnli \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $GLUE_DIR/MNLI/ \ + --max_seq_length 128 \ + --train_batch_size 8 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --output_dir output_dir \ +``` + +The results are the following: + +```bash +***** Eval results ***** + acc = 0.8679706601466992 + eval_loss = 0.4911287787382479 + global_step = 18408 + loss = 0.04755385363816904 + +***** Eval results ***** + acc = 0.8747965825874695 + eval_loss = 0.45516540421714036 + global_step = 18408 + loss = 0.04755385363816904 +``` + +## SQuAD + +#### Fine-tuning on SQuAD + +This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) +on a single tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a +$SQUAD_DIR directory. 
+ +* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json) +* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json) +* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py) + +```bash +export SQUAD_DIR=/path/to/SQUAD + +python run_bert_squad.py \ + --bert_model bert-base-uncased \ + --do_train \ + --do_predict \ + --do_lower_case \ + --train_file $SQUAD_DIR/train-v1.1.json \ + --predict_file $SQUAD_DIR/dev-v1.1.json \ + --train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2.0 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ +``` + +Training with the previously defined hyper-parameters yields the following results: + +```bash +f1 = 88.52 +exact_match = 81.22 +``` + +#### Distributed training + + +Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD: + +```bash + python -m torch.distributed.launch --nproc_per_node=8 \ + run_bert_squad.py \ + --bert_model bert-large-uncased-whole-word-masking \ + --do_train \ + --do_predict \ + --do_lower_case \ + --train_file $SQUAD_DIR/train-v1.1.json \ + --predict_file $SQUAD_DIR/dev-v1.1.json \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir ../models/wwm_uncased_finetuned_squad/ \ + --train_batch_size 24 \ + --gradient_accumulation_steps 12 +``` + +Training with the previously defined hyper-parameters yields the following results: + +```bash +f1 = 93.15 +exact_match = 86.91 +``` + +This fine-tuneds model is available as a checkpoint under the reference +`bert-large-uncased-whole-word-masking-finetuned-squad`. + diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index d37f7a443a3e28..a1995ae2242b35 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Fine-tuning the library models for language modeling on WikiText-2 (GPT, GPT-2, BERT, RoBERTa). +Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned using a masked language modeling (MLM) loss. """ From 5e151f5e7771b89989c9d553c2b014d84770a1c3 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 6 Sep 2019 12:08:36 -0400 Subject: [PATCH 106/119] Table of contents --- examples/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/README.md b/examples/README.md index 46ff9270db7e2e..a2236cf0f3a2d0 100644 --- a/examples/README.md +++ b/examples/README.md @@ -3,6 +3,13 @@ In this section a few examples are put together. All of these examples work for several models, making use of the very similar API between the different models. +| Section | Description | +|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. 
| +| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. | +| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. | +| [SQuAD](#squad) | Using BERT for question answering, examples with distributed training. | + ## Language model fine-tuning Based on the script `run_lm_finetuning.py`. From e52737d5ad4261686cf3bcf9ebd9e273ab5fb3e4 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 6 Sep 2019 12:13:31 -0400 Subject: [PATCH 107/119] Updated docs README to feature the examples symlink --- docs/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/README.md b/docs/README.md index 1b3c1feade66bf..6804a22e6966ff 100644 --- a/docs/README.md +++ b/docs/README.md @@ -34,6 +34,13 @@ pip install recommonmark ## Building the documentation +Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the followig +command to generate it: + +```bash +ln -s ../../examples/README.md source/examples.md +``` + Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder: ```bash From ee027c89f2b8fd0338df39b7e0b48345ea132e99 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 6 Sep 2019 23:40:05 +0300 Subject: [PATCH 108/119] fix #1165 --- hubconf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hubconf.py b/hubconf.py index 35e7f1eea83255..d9aaa6b53aa091 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,7 +1,7 @@ from pytorch_transformers import ( AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering ) -from pytorch_transformers.modeling_utils import add_start_docstrings +from pytorch_transformers.file_utils import add_start_docstrings dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses'] From f47f9a5874df4fa450f5754574b007e50ee94506 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 6 Sep 2019 17:10:33 -0400 Subject: [PATCH 109/119] Updated outdated examples --- examples/README.md | 50 ++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/examples/README.md b/examples/README.md index a2236cf0f3a2d0..4a8e1376b05a9c 100644 --- a/examples/README.md +++ b/examples/README.md @@ -12,7 +12,7 @@ similar API between the different models. ## Language model fine-tuning -Based on the script `run_lm_finetuning.py`. +Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_lm_finetuning.py). Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa @@ -52,8 +52,8 @@ The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using th as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their pre-training: masked language modeling. -In accordance to the RoBERTa paper, we use dynamic masking rather than static masking. The model may therefore converge -slower, but over-fitting would take more epochs. +In accordance to the RoBERTa paper, we use dynamic masking rather than static masking. 
The model may, therefore, converge +slightly slower (over-fitting takes more epochs). We use the `--mlm` flag so that the script may change its loss function. @@ -74,6 +74,8 @@ python run_lm_finetuning.py \ ## Language generation +Based on the script [`run_generation.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_generation.py). + Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you can try out the different models available in the library. @@ -88,6 +90,8 @@ python run_generation.py \ ## GLUE +Based on the script [`run_glue.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py). + Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa. @@ -120,13 +124,14 @@ and unpack it to some directory `$GLUE_DIR`. export GLUE_DIR=/path/to/glue export TASK_NAME=MRPC -python run_bert_classifier.py \ +python run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ --task_name $TASK_NAME \ --do_train \ --do_eval \ --do_lower_case \ --data_dir $GLUE_DIR/$TASK_NAME \ - --bert_model bert-base-uncased \ --max_seq_length 128 \ --train_batch_size 32 \ --learning_rate 2e-5 \ @@ -160,13 +165,14 @@ and unpack it to some directory `$GLUE_DIR`. ```bash export GLUE_DIR=/path/to/glue -python run_bert_classifier.py \ +python run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ --task_name MRPC \ --do_train \ --do_eval \ --do_lower_case \ --data_dir $GLUE_DIR/MRPC/ \ - --bert_model bert-base-uncased \ --max_seq_length 128 \ --train_batch_size 32 \ --learning_rate 2e-5 \ @@ -186,13 +192,14 @@ Using Apex and 16 bit precision, the fine-tuning on MRPC only takes 27 seconds. ```bash export GLUE_DIR=/path/to/glue -python run_bert_classifier.py \ +python run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ --task_name MRPC \ --do_train \ --do_eval \ --do_lower_case \ --data_dir $GLUE_DIR/MRPC/ \ - --bert_model bert-base-uncased \ --max_seq_length 128 \ --train_batch_size 32 \ --learning_rate 2e-5 \ @@ -210,8 +217,9 @@ reaches F1 > 92 on MRPC. 
export GLUE_DIR=/path/to/glue python -m torch.distributed.launch \ - --nproc_per_node 8 run_bert_classifier.py \ - --bert_model bert-large-uncased-whole-word-masking \ + --nproc_per_node 8 run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ --task_name MRPC \ --do_train \ --do_eval \ @@ -221,7 +229,7 @@ python -m torch.distributed.launch \ --train_batch_size 8 \ --learning_rate 2e-5 \ --num_train_epochs 3.0 \ - --output_dir /tmp/mrpc_output/ + --output_dir /tmp/mrpc_output/ ``` Training with these hyper-parameters gave us the following results: @@ -243,8 +251,9 @@ The following example uses the BERT-large, uncased, whole-word-masking model and export GLUE_DIR=/path/to/glue python -m torch.distributed.launch \ - --nproc_per_node 8 run_bert_classifier.py \ - --bert_model bert-large-uncased-whole-word-masking \ + --nproc_per_node 8 run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ --task_name mnli \ --do_train \ --do_eval \ @@ -275,6 +284,8 @@ The results are the following: ## SQuAD +Based on the script [`run_squad.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py). + #### Fine-tuning on SQuAD This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) @@ -288,8 +299,9 @@ $SQUAD_DIR directory. ```bash export SQUAD_DIR=/path/to/SQUAD -python run_bert_squad.py \ - --bert_model bert-base-uncased \ +python run_squad.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ --do_train \ --do_predict \ --do_lower_case \ @@ -316,9 +328,9 @@ exact_match = 81.22 Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD: ```bash - python -m torch.distributed.launch --nproc_per_node=8 \ - run_bert_squad.py \ - --bert_model bert-large-uncased-whole-word-masking \ +python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ --do_train \ --do_predict \ --do_lower_case \ From 3f91338be90f39d4b4845d5ea09da9cde7777177 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 6 Sep 2019 17:48:06 -0400 Subject: [PATCH 110/119] Patched a few outdated parameters --- examples/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/README.md b/examples/README.md index 4a8e1376b05a9c..c47dc41433ae3a 100644 --- a/examples/README.md +++ b/examples/README.md @@ -133,7 +133,7 @@ python run_glue.py \ --do_lower_case \ --data_dir $GLUE_DIR/$TASK_NAME \ --max_seq_length 128 \ - --train_batch_size 32 \ + --per_gpu_train_batch_size 32 \ --learning_rate 2e-5 \ --num_train_epochs 3.0 \ --output_dir /tmp/$TASK_NAME/ @@ -174,7 +174,7 @@ python run_glue.py \ --do_lower_case \ --data_dir $GLUE_DIR/MRPC/ \ --max_seq_length 128 \ - --train_batch_size 32 \ + --per_gpu_train_batch_size 32 \ --learning_rate 2e-5 \ --num_train_epochs 3.0 \ --output_dir /tmp/mrpc_output/ @@ -201,7 +201,7 @@ python run_glue.py \ --do_lower_case \ --data_dir $GLUE_DIR/MRPC/ \ --max_seq_length 128 \ - --train_batch_size 32 \ + --per_gpu_train_batch_size 32 \ --learning_rate 2e-5 \ --num_train_epochs 3.0 \ --output_dir /tmp/mrpc_output/ \ @@ -226,7 +226,7 @@ python -m torch.distributed.launch \ --do_lower_case \ --data_dir $GLUE_DIR/MRPC/ \ --max_seq_length 128 \ - --train_batch_size 8 \ + --per_gpu_train_batch_size 8 \ --learning_rate 2e-5 \ --num_train_epochs 3.0 \ --output_dir /tmp/mrpc_output/ @@ -260,7 +260,7 
@@ python -m torch.distributed.launch \ --do_lower_case \ --data_dir $GLUE_DIR/MNLI/ \ --max_seq_length 128 \ - --train_batch_size 8 \ + --per_gpu_train_batch_size 8 \ --learning_rate 2e-5 \ --num_train_epochs 3.0 \ --output_dir output_dir \ @@ -303,11 +303,11 @@ python run_squad.py \ --model_type bert \ --model_name_or_path bert-base-cased \ --do_train \ - --do_predict \ + --do_eval \ --do_lower_case \ --train_file $SQUAD_DIR/train-v1.1.json \ --predict_file $SQUAD_DIR/dev-v1.1.json \ - --train_batch_size 12 \ + --per_gpu_train_batch_size 12 \ --learning_rate 3e-5 \ --num_train_epochs 2.0 \ --max_seq_length 384 \ @@ -332,7 +332,7 @@ python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \ --model_type bert \ --model_name_or_path bert-base-cased \ --do_train \ - --do_predict \ + --do_eval \ --do_lower_case \ --train_file $SQUAD_DIR/train-v1.1.json \ --predict_file $SQUAD_DIR/dev-v1.1.json \ @@ -341,7 +341,7 @@ python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \ --max_seq_length 384 \ --doc_stride 128 \ --output_dir ../models/wwm_uncased_finetuned_squad/ \ - --train_batch_size 24 \ + --per_gpu_train_batch_size 24 \ --gradient_accumulation_steps 12 ``` From 3401980fc4446d556ca075f6a627bdee33684341 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 9 Sep 2019 10:22:12 +0300 Subject: [PATCH 111/119] fix #1208 --- pytorch_transformers/modeling_xlnet.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py index 3ac791b12194e1..d8744e95e769dd 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/pytorch_transformers/modeling_xlnet.py @@ -504,8 +504,10 @@ def _init_weights(self, module): :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: A parallel sequence of tokens (can be used to indicate various portions of the inputs). - The embeddings from these tokens will be summed with the respective token embeddings. - Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). + The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and + the important thing is that they should be different for tokens which belong to different segments. + The model will compute relative segment differences from the given type indices: + 0 if the segment id of two tokens are the same, 1 if not. **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. 
Mask values selected in ``[0, 1]``: From e5df36397be0dfa41d723d14f913cd149231ed92 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Singh Date: Mon, 9 Sep 2019 19:55:57 +0530 Subject: [PATCH 112/119] changes in return statement of evaluate function changed `results` to `result` and removed `results` dict defined previously --- examples/run_lm_finetuning.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index a1995ae2242b35..4d14fe7ebb50ce 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -247,7 +247,6 @@ def evaluate(args, model, tokenizer, prefix=""): # Loop to handle MNLI double evaluation (matched, mis-matched) eval_output_dir = args.output_dir - results = {} eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True) if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: @@ -289,7 +288,7 @@ def evaluate(args, model, tokenizer, prefix=""): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) - return results + return result def main(): From a95ced6260494f108df72f0f9ffe9c60498365ad Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Mon, 9 Sep 2019 19:53:35 +0000 Subject: [PATCH 113/119] [Distillation] save last chkpt as `pytorch_model.bin` --- examples/distillation/distiller.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index ed710a2bee6c7d..93135e292c6fe8 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -295,7 +295,10 @@ def train(self): if self.is_master: logger.info(f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}') self.end_epoch() - if self.is_master: logger.info('Training is finished') + if self.is_master: + logger.info(f'Save very last checkpoint as `pytorch_model.bin`.') + self.save_checkpoint(checkpoint_name=f'pytorch_model.bin') + logger.info('Training is finished') def step(self, input_ids: torch.tensor, From eab980fd684d82e7e2f9e045e570389fe4f7e693 Mon Sep 17 00:00:00 2001 From: searchivarius Date: Mon, 9 Sep 2019 19:58:08 -0400 Subject: [PATCH 114/119] Fix to prevent crashing on assert len(tokens_b)>=1 --- examples/lm_finetuning/pregenerate_training_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py index ff40d95f751f39..2906c6fd2bf8ce 100644 --- a/examples/lm_finetuning/pregenerate_training_data.py +++ b/examples/lm_finetuning/pregenerate_training_data.py @@ -329,7 +329,8 @@ def main(): doc = [] else: tokens = tokenizer.tokenize(line) - doc.append(tokens) + if tokens: + doc.append(tokens) if doc: docs.add_document(doc) # If the last doc didn't end on a newline, make sure it still gets added if len(docs) <= 1: From f2cf6ce4a9b97d570b0b17f5855a44bab437ea19 Mon Sep 17 00:00:00 2001 From: mattolson93 <32203230+mattolson93@users.noreply.github.com> Date: Tue, 10 Sep 2019 09:12:01 -0700 Subject: [PATCH 115/119] Fixing typo in gpt2 for doc site's class link --- pytorch_transformers/modeling_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 42686411875e77..bbb0bed3b1f4df 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -290,7 +290,7 @@ def _init_weights(self, module): Indices of 
input sequence tokens in the vocabulary. GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. - Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. + Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`. See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. **past**: From 364920e216c16d73c782a61a4cf6652e541fbe18 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Tue, 10 Sep 2019 21:45:01 +0000 Subject: [PATCH 116/119] fix small bug/typo --- examples/distillation/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/distillation/dataset.py b/examples/distillation/dataset.py index cdc16b94f34ca4..89e3f1187f5f6a 100644 --- a/examples/distillation/dataset.py +++ b/examples/distillation/dataset.py @@ -77,7 +77,7 @@ def divide_chunks(l, n): if sub_s[0] != cls_id: sub_s = np.insert(sub_s, 0, cls_id) if sub_s[-1] != sep_id: - sub_s = np.insert(sub_s, len(sub_s), cls_id) + sub_s = np.insert(sub_s, len(sub_s), sep_id) assert len(sub_s) <= max_len sub_seqs.append(sub_s) From 8bdee1cb733d3c735c64a1ab5e7b41cec02d8fa3 Mon Sep 17 00:00:00 2001 From: Zili Wang Date: Wed, 11 Sep 2019 15:41:53 +0800 Subject: [PATCH 117/119] fixed: hard coding for max and min number will out of range in fp16, which will cause nan. --- pytorch_transformers/modeling_transfo_xl.py | 34 +++++++++++++-------- pytorch_transformers/modeling_utils.py | 7 +++-- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py index 7ad9d10891fbb6..228f75fc82064f 100644 --- a/pytorch_transformers/modeling_transfo_xl.py +++ b/pytorch_transformers/modeling_transfo_xl.py @@ -231,7 +231,7 @@ def forward(self, inp): class MultiHeadAttn(nn.Module): - def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False): super(MultiHeadAttn, self).__init__() @@ -451,11 +451,19 @@ def forward(self, w, r, attn_mask=None, mems=None, head_mask=None): if attn_mask is not None and torch.sum(attn_mask).item(): attn_mask = (attn_mask == 1) # Switch to bool if attn_mask.dim() == 2: - attn_score = attn_score.float().masked_fill( - attn_mask[None,:,:,None], -1e30).type_as(attn_score) + if next(self.parameters()).dtype == torch.float16: + attn_score = attn_score.float().masked_fill( + attn_mask[None,:,:,None], -65000).type_as(attn_score) + else: + attn_score = attn_score.float().masked_fill( + attn_mask[None,:,:,None], -1e30).type_as(attn_score) elif attn_mask.dim() == 3: - attn_score = attn_score.float().masked_fill( - attn_mask[:,:,:,None], -1e30).type_as(attn_score) + if next(self.parameters()).dtype == torch.float16: + attn_score = attn_score.float().masked_fill( + attn_mask[:,:,:,None], -65000).type_as(attn_score) + else: + attn_score = attn_score.float().masked_fill( + attn_mask[:,:,:,None], -1e30).type_as(attn_score) # [qlen x klen x bsz x n_head] attn_prob = F.softmax(attn_score, dim=1) @@ -587,7 +595,7 @@ def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs): super(DecoderLayer, self).__init__() self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) - self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + self.pos_ff = PositionwiseFF(d_model, d_inner, 
dropout, pre_lnorm=kwargs.get('pre_lnorm')) def forward(self, dec_inp, dec_attn_mask=None, mems=None, head_mask=None): @@ -607,7 +615,7 @@ def __init__(self, n_head, d_model, d_head, d_inner, dropout, self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) - self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, pre_lnorm=kwargs.get('pre_lnorm')) def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None, head_mask=None): @@ -628,7 +636,7 @@ def __init__(self, n_head, d_model, d_head, d_inner, dropout, self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) - self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, pre_lnorm=kwargs.get('pre_lnorm')) def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None): @@ -645,7 +653,7 @@ def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None): class AdaptiveEmbedding(nn.Module): - def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False): super(AdaptiveEmbedding, self).__init__() @@ -683,7 +691,7 @@ def forward(self, inp): else: param = next(self.parameters()) inp_flat = inp.view(-1) - emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], + emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], dtype=param.dtype, device=param.device) for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] @@ -852,7 +860,7 @@ def __init__(self, config): self.n_head = config.n_head self.d_head = config.d_head - self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, + self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val) self.drop = nn.Dropout(config.dropout) @@ -1011,7 +1019,7 @@ def _forward(self, dec_inp, mems=None, head_mask=None): hids = [] attentions = [] if self.attn_type == 0: # default - pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, + pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, dtype=word_emb.dtype) if self.clamp_len > 0: pos_seq.clamp_(max=self.clamp_len) @@ -1165,7 +1173,7 @@ def __init__(self, config): self.sampler = LogUniformSampler(config.n_token, config.sample_softmax) # use adaptive softmax (including standard softmax) else: - self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model, + self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val) self.init_weights() self.tie_weights() diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 2fb4671674a2c7..25aeefe10f1369 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -140,7 +140,7 @@ def resize_token_embeddings(self, new_num_tokens=None): Arguments: new_num_tokens: (`optional`) int: - New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. + New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. 
If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model. Return: ``torch.nn.Embeddings`` @@ -434,7 +434,10 @@ def forward(self, hidden_states, p_mask=None): x = self.dense(hidden_states).squeeze(-1) if p_mask is not None: - x = x * (1 - p_mask) - 1e30 * p_mask + if next(self.parameters()).dtype == torch.float16: + x = x * (1 - p_mask) - 65500 * p_mask + else: + x = x * (1 - p_mask) - 1e30 * p_mask return x From 11ac4b955514ac2fbd105121863afd23205c7c12 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 11 Sep 2019 10:13:44 +0200 Subject: [PATCH 118/119] [CI] Symbolic link for documentation --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 48e80beaebbe18..f42e3023afbbc0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -37,6 +37,7 @@ jobs: - checkout - run: sudo pip install --progress-bar off -r docs/requirements.txt - run: sudo pip install --progress-bar off -r requirements.txt + - run: cd docs/source && ln -s ../../examples/README.md examples.md && cd - - run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir workflow_filters: &workflow_filters filters: From 32e1332acf6fd1ad372b81c296d43be441d3b0b1 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Wed, 11 Sep 2019 14:19:07 +0000 Subject: [PATCH 119/119] [distil] fix once for all general logger for scripts --- examples/distillation/scripts/binarized_data.py | 8 ++++++-- examples/distillation/scripts/token_counts.py | 6 +++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py index 792a5692e442ae..51be8fd0be565d 100644 --- a/examples/distillation/scripts/binarized_data.py +++ b/examples/distillation/scripts/binarized_data.py @@ -21,8 +21,12 @@ import time import numpy as np from pytorch_transformers import BertTokenizer +import logging -from examples.distillation.utils import logger +logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.INFO) +logger = logging.getLogger(__name__) def main(): parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).") @@ -74,4 +78,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/examples/distillation/scripts/token_counts.py b/examples/distillation/scripts/token_counts.py index d791c66be30628..a484a6f51b3f65 100644 --- a/examples/distillation/scripts/token_counts.py +++ b/examples/distillation/scripts/token_counts.py @@ -18,8 +18,12 @@ from collections import Counter import argparse import pickle +import logging -from examples.distillation.utils import logger +logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.INFO) +logger = logging.getLogger(__name__) if __name__ == '__main__': parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)")
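The fp16-related patches above replace the `-1e30` masking constant with values such as `-65000`/`-65500` because the largest finite magnitude in IEEE half precision is 65504; anything larger overflows to `-inf`, and a fully masked row then turns the softmax output into NaNs. A short, library-independent illustration in plain PyTorch:

```python
# Why -1e30 cannot be used as an additive mask value in FP16: it overflows to -inf,
# and a row of -inf scores produces NaNs in the softmax (0/0 after exponentiation).
import torch

print(torch.finfo(torch.float16).min)           # -65504.0, the FP16 lower bound

scores = torch.zeros(4, dtype=torch.float16)
bad = scores - torch.tensor(1e30).half()        # 1e30 overflows to inf in FP16
safe = scores - torch.tensor(65000.0).half()    # stays finite

print(bad)                                      # tensor of -inf values
print(torch.softmax(bad.float(), dim=-1))       # NaN everywhere
print(torch.softmax(safe.float(), dim=-1))      # a valid (uniform) distribution
```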