From e0a2da2dc782f8402ca7314d20dd9c054dae44be Mon Sep 17 00:00:00 2001 From: Atif Ahmed Date: Tue, 17 Dec 2019 18:24:39 -0500 Subject: [PATCH 01/12] Adding BERT for MS-MARCO passage re-ranking pretrained model --- texar/torch/modules/pretrained/bert.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/texar/torch/modules/pretrained/bert.py b/texar/torch/modules/pretrained/bert.py index 5f3de240e..eaf78ceea 100644 --- a/texar/torch/modules/pretrained/bert.py +++ b/texar/torch/modules/pretrained/bert.py @@ -34,6 +34,7 @@ _BIOBERT_PATH = "https://github.com/naver/biobert-pretrained/releases/download/" _SCIBERT_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-research/" \ "scibert/tensorflow_models/" +_BERT_MSMARCO_PATH = "https://drive.google.com/file/d/" class PretrainedBERTMixin(PretrainedMixin, ABC): @@ -97,6 +98,16 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): * ``scibert-basevocab-cased``: Cased version of the model trained on the original BERT vocabulary. + * **BERT for MS-MARCO**: proposed in (`Nogueira et al`. 2019) + `Passage Re-ranking with BERT`_. A BERT model fine-tuned on MS-MARCO + (Nguyen et al., 2016) dataset. It's the best performing model (on Jan 8th + 2019) on MS-MARCO Passage re-ranking task. Two models are included: + + * ``bert-msmarco-base``: Original BERT base model fine-tuned on + MS-MARCO. + * ``bert-msmarco-large``: Original BERT large model fine-tuned on + MS-MARCO. + We provide the following BERT classes: * :class:`~texar.torch.modules.BERTEncoder` for text encoding. @@ -111,6 +122,9 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): .. _`SciBERT: A Pretrained Language Model for Scientific Text`: https://arxiv.org/abs/1903.10676 + + .. _`BERT for MS-MARCO: Passage re-ranking with BERT`: + https://arxiv.org/abs/1901.04085 """ _MODEL_NAME = "BERT" @@ -150,6 +164,12 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): _SCIBERT_PATH + 'scibert_basevocab_uncased.tar.gz', 'scibert-basevocab-cased': _SCIBERT_PATH + 'scibert_basevocab_cased.tar.gz', + + # BERT for MS-MARCO + 'bert-msmarco-base': + _BERT_MSMARCO_PATH + '1cyUrhs7JaCJTTu-DjFUqP6Bs4f8a6JTX/view', + 'bert-msmarco-large': + _BERT_MSMARCO_PATH + '1crlASTMlsihALlkabAQP6JTYIZwC1Wm8/view' } _MODEL2CKPT = { # Standard BERT @@ -172,6 +192,10 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): 'scibert-scivocab-cased': 'bert_model.ckpt', 'scibert-basevocab-uncased': 'bert_model.ckpt', 'scibert-basevocab-cased': 'bert_model.ckpt', + + # BERT for MSMARCO + 'bert-msmarco-base': 'model.ckpt-100000', + 'bert-msmarco-large': 'model.ckpt-100000', } @classmethod From 90a06a18900f087f66f4d9396f32ef5d3d1fa2fa Mon Sep 17 00:00:00 2001 From: Atif Ahmed Date: Fri, 20 Dec 2019 11:05:41 -0500 Subject: [PATCH 02/12] Adding logits layer weights and bias --- texar/torch/modules/pretrained/bert.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/texar/torch/modules/pretrained/bert.py b/texar/torch/modules/pretrained/bert.py index eaf78ceea..2271d7f4e 100644 --- a/texar/torch/modules/pretrained/bert.py +++ b/texar/torch/modules/pretrained/bert.py @@ -167,9 +167,9 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): # BERT for MS-MARCO 'bert-msmarco-base': - _BERT_MSMARCO_PATH + '1cyUrhs7JaCJTTu-DjFUqP6Bs4f8a6JTX/view', + _BERT_MSMARCO_PATH + '1cyUrhs7JaCJTTu-DjFUqP6Bs4f8a6JTX/', 'bert-msmarco-large': - _BERT_MSMARCO_PATH + '1crlASTMlsihALlkabAQP6JTYIZwC1Wm8/view' + _BERT_MSMARCO_PATH + '1crlASTMlsihALlkabAQP6JTYIZwC1Wm8/' } _MODEL2CKPT = { # Standard BERT @@ -325,7 +325,9 @@ def _init_from_checkpoint(self, pretrained_model_name: str, } pooler_map = { 'bert/pooler/dense/bias': 'pooler.0.bias', - 'bert/pooler/dense/kernel': 'pooler.0.weight' + 'bert/pooler/dense/kernel': 'pooler.0.weight', + 'output_bias': '_logits_layer.bias', + 'output_weights': '_logits_layer.weight', } tf_path = os.path.abspath(os.path.join( cache_dir, self._MODEL2CKPT[pretrained_model_name])) From 014c558b0d1f0d6f1a5558357400206240df9a1d Mon Sep 17 00:00:00 2001 From: Atif Ahmed Date: Fri, 20 Dec 2019 12:03:20 -0500 Subject: [PATCH 03/12] Making the PretrainedMixin work for both encoder and classifier --- .../modules/classifiers/bert_classifier.py | 4 +++ texar/torch/modules/pretrained/bert.py | 26 ++++++++++++++++--- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/texar/torch/modules/classifiers/bert_classifier.py b/texar/torch/modules/classifiers/bert_classifier.py index 5303e7259..a0dc2f8e8 100644 --- a/texar/torch/modules/classifiers/bert_classifier.py +++ b/texar/torch/modules/classifiers/bert_classifier.py @@ -71,6 +71,8 @@ def __init__(self, super().__init__(hparams=hparams) + self.load_pretrained_config(pretrained_model_name, cache_dir) + # Create the underlying encoder encoder_hparams = dict_fetch(hparams, self._ENCODER_CLASS.default_hparams()) @@ -120,6 +122,8 @@ def __init__(self, (self.num_classes <= 0 and self._hparams.encoder.dim == 1) + self.init_pretrained_weights(class_type='classifier') + @staticmethod def default_hparams(): r"""Returns a dictionary of hyperparameters with default values. diff --git a/texar/torch/modules/pretrained/bert.py b/texar/torch/modules/pretrained/bert.py index 2271d7f4e..5341fd875 100644 --- a/texar/torch/modules/pretrained/bert.py +++ b/texar/torch/modules/pretrained/bert.py @@ -326,12 +326,20 @@ def _init_from_checkpoint(self, pretrained_model_name: str, pooler_map = { 'bert/pooler/dense/bias': 'pooler.0.bias', 'bert/pooler/dense/kernel': 'pooler.0.weight', + } + classifier_map = { 'output_bias': '_logits_layer.bias', 'output_weights': '_logits_layer.weight', } + global_prefix_map = { + 'classifier': '_encoder.' + } tf_path = os.path.abspath(os.path.join( cache_dir, self._MODEL2CKPT[pretrained_model_name])) + class_type = kwargs.get('class_type', 'encoder') + global_prefix = global_prefix_map.get(class_type, '') + # Load weights from TF model init_vars = tf.train.list_variables(tf_path) tfnames, arrays = [], [] @@ -351,13 +359,14 @@ def _init_from_checkpoint(self, pretrained_model_name: str, continue if name in global_tensor_map: - v_name = global_tensor_map[name] + v_name = global_prefix + global_tensor_map[name] pointer = self._name_to_variable(v_name) assert pointer.shape == array.shape pointer.data = torch.from_numpy(array) idx += 1 elif name in pooler_map: - pointer = self._name_to_variable(pooler_map[name]) + pointer = self._name_to_variable(global_prefix + + pooler_map[name]) if name.endswith('bias'): assert pointer.shape == array.shape pointer.data = torch.from_numpy(array) @@ -367,6 +376,13 @@ def _init_from_checkpoint(self, pretrained_model_name: str, assert pointer.shape == array_t.shape pointer.data = torch.from_numpy(array_t) idx += 1 + elif name in classifier_map: + if class_type != 'classifier': + continue + pointer = self._name_to_variable(classifier_map[name]) + assert pointer.shape == array.shape + pointer.data = torch.from_numpy(array) + idx += 1 else: # here name is the TensorFlow variable name name_tmp = name.split("/") @@ -375,12 +391,14 @@ def _init_from_checkpoint(self, pretrained_model_name: str, name_tmp = "/".join(name_tmp[3:]) if name_tmp in layer_tensor_map: v_name = layer_tensor_map[name_tmp].format(layer_no) - pointer = self._name_to_variable(py_prefix + v_name) + pointer = self._name_to_variable(global_prefix + + py_prefix + v_name) assert pointer.shape == array.shape pointer.data = torch.from_numpy(array) elif name_tmp in layer_transpose_map: v_name = layer_transpose_map[name_tmp].format(layer_no) - pointer = self._name_to_variable(py_prefix + v_name) + pointer = self._name_to_variable(global_prefix + + py_prefix + v_name) array_t = np.transpose(array) assert pointer.shape == array_t.shape pointer.data = torch.from_numpy(array_t) From 8938520c6032984a788cdd3ba8a2ab5e1669db7a Mon Sep 17 00:00:00 2001 From: Atif Ahmed Date: Fri, 20 Dec 2019 13:31:27 -0500 Subject: [PATCH 04/12] Adding tokenizer part --- texar/torch/data/tokenizers/bert_tokenizer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/texar/torch/data/tokenizers/bert_tokenizer.py b/texar/torch/data/tokenizers/bert_tokenizer.py index 1d0f244ff..c5824989c 100644 --- a/texar/torch/data/tokenizers/bert_tokenizer.py +++ b/texar/torch/data/tokenizers/bert_tokenizer.py @@ -74,6 +74,10 @@ class BERTTokenizer(PretrainedBERTMixin, TokenizerBase): 'scibert-scivocab-cased': 512, 'scibert-basevocab-uncased': 512, 'scibert-basevocab-cased': 512, + + # BERT for MS-MARCO + 'bert-msmarco-base': 512, + 'bert-msmarco-large': 512, } _VOCAB_FILE_NAMES = {'vocab_file': 'vocab.txt'} _VOCAB_FILE_MAP = { @@ -98,6 +102,10 @@ class BERTTokenizer(PretrainedBERTMixin, TokenizerBase): 'scibert-scivocab-cased': 'vocab.txt', 'scibert-basevocab-uncased': 'vocab.txt', 'scibert-basevocab-cased': 'vocab.txt', + + # BERT for MS-MARCO + 'bert-msmarco-base': 'vocab.txt', + 'bert-msmarco-large': 'vocab.txt', } } From 703896f54b54ed56665d624b8d360b4fe495320e Mon Sep 17 00:00:00 2001 From: Atif Ahmed Date: Fri, 20 Dec 2019 14:19:12 -0500 Subject: [PATCH 05/12] docstring --- texar/torch/modules/pretrained/bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texar/torch/modules/pretrained/bert.py b/texar/torch/modules/pretrained/bert.py index 5341fd875..5da028057 100644 --- a/texar/torch/modules/pretrained/bert.py +++ b/texar/torch/modules/pretrained/bert.py @@ -123,7 +123,7 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): .. _`SciBERT: A Pretrained Language Model for Scientific Text`: https://arxiv.org/abs/1903.10676 - .. _`BERT for MS-MARCO: Passage re-ranking with BERT`: + .. _`Passage re-ranking with BERT`: https://arxiv.org/abs/1901.04085 """ From e4a97381ecc1a2a582b6f404ca976813e7ed6678 Mon Sep 17 00:00:00 2001 From: Atif Ahmed Date: Fri, 20 Dec 2019 14:43:00 -0500 Subject: [PATCH 06/12] Spelling --- texar/torch/modules/pretrained/bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texar/torch/modules/pretrained/bert.py b/texar/torch/modules/pretrained/bert.py index 5da028057..088f8864e 100644 --- a/texar/torch/modules/pretrained/bert.py +++ b/texar/torch/modules/pretrained/bert.py @@ -123,7 +123,7 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): .. _`SciBERT: A Pretrained Language Model for Scientific Text`: https://arxiv.org/abs/1903.10676 - .. _`Passage re-ranking with BERT`: + .. _`Passage Re-ranking with BERT`: https://arxiv.org/abs/1901.04085 """ From 67b1951a4cfa5e3dbeff6d5a8267b8695b48b45a Mon Sep 17 00:00:00 2001 From: Atif Ahmed Date: Fri, 20 Dec 2019 15:17:22 -0500 Subject: [PATCH 07/12] Spelling --- texar/torch/modules/pretrained/bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texar/torch/modules/pretrained/bert.py b/texar/torch/modules/pretrained/bert.py index 088f8864e..c327601c6 100644 --- a/texar/torch/modules/pretrained/bert.py +++ b/texar/torch/modules/pretrained/bert.py @@ -100,7 +100,7 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): * **BERT for MS-MARCO**: proposed in (`Nogueira et al`. 2019) `Passage Re-ranking with BERT`_. A BERT model fine-tuned on MS-MARCO - (Nguyen et al., 2016) dataset. It's the best performing model (on Jan 8th + (`Nguyen et al`., 2016) dataset. It's the best performing model (on Jan 8th 2019) on MS-MARCO Passage re-ranking task. Two models are included: * ``bert-msmarco-base``: Original BERT base model fine-tuned on From e8663fd1f22b37e16f478573fa4ffa988929e152 Mon Sep 17 00:00:00 2001 From: Atif Ahmed Date: Fri, 20 Dec 2019 15:34:03 -0500 Subject: [PATCH 08/12] Lint --- texar/torch/modules/pretrained/bert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/texar/torch/modules/pretrained/bert.py b/texar/torch/modules/pretrained/bert.py index c327601c6..ba3016a9f 100644 --- a/texar/torch/modules/pretrained/bert.py +++ b/texar/torch/modules/pretrained/bert.py @@ -100,8 +100,8 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): * **BERT for MS-MARCO**: proposed in (`Nogueira et al`. 2019) `Passage Re-ranking with BERT`_. A BERT model fine-tuned on MS-MARCO - (`Nguyen et al`., 2016) dataset. It's the best performing model (on Jan 8th - 2019) on MS-MARCO Passage re-ranking task. Two models are included: + (`Nguyen et al`., 2016) dataset. It's the best performing model (on Jan + 8th 2019) on MS-MARCO Passage re-ranking task. Two models are included: * ``bert-msmarco-base``: Original BERT base model fine-tuned on MS-MARCO. From 31d3e07a478f9fe5b30132bb57fccbf1ea27104f Mon Sep 17 00:00:00 2001 From: Atif Ahmed Date: Mon, 23 Dec 2019 14:10:24 -0500 Subject: [PATCH 09/12] Changing name --- texar/torch/data/tokenizers/bert_tokenizer.py | 8 ++++---- texar/torch/modules/pretrained/bert.py | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/texar/torch/data/tokenizers/bert_tokenizer.py b/texar/torch/data/tokenizers/bert_tokenizer.py index c5824989c..f11dab9be 100644 --- a/texar/torch/data/tokenizers/bert_tokenizer.py +++ b/texar/torch/data/tokenizers/bert_tokenizer.py @@ -76,8 +76,8 @@ class BERTTokenizer(PretrainedBERTMixin, TokenizerBase): 'scibert-basevocab-cased': 512, # BERT for MS-MARCO - 'bert-msmarco-base': 512, - 'bert-msmarco-large': 512, + 'bert-msmarco-nogueira19-base': 512, + 'bert-msmarco-nogueira19-large': 512, } _VOCAB_FILE_NAMES = {'vocab_file': 'vocab.txt'} _VOCAB_FILE_MAP = { @@ -104,8 +104,8 @@ class BERTTokenizer(PretrainedBERTMixin, TokenizerBase): 'scibert-basevocab-cased': 'vocab.txt', # BERT for MS-MARCO - 'bert-msmarco-base': 'vocab.txt', - 'bert-msmarco-large': 'vocab.txt', + 'bert-msmarco-nogueira19-base': 'vocab.txt', + 'bert-msmarco-nogueira19-large': 'vocab.txt', } } diff --git a/texar/torch/modules/pretrained/bert.py b/texar/torch/modules/pretrained/bert.py index ba3016a9f..1b18a722d 100644 --- a/texar/torch/modules/pretrained/bert.py +++ b/texar/torch/modules/pretrained/bert.py @@ -34,7 +34,7 @@ _BIOBERT_PATH = "https://github.com/naver/biobert-pretrained/releases/download/" _SCIBERT_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-research/" \ "scibert/tensorflow_models/" -_BERT_MSMARCO_PATH = "https://drive.google.com/file/d/" +_BERT_MSMARCO_NOGUEIRA19_PATH = "https://drive.google.com/file/d/" class PretrainedBERTMixin(PretrainedMixin, ABC): @@ -103,9 +103,9 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): (`Nguyen et al`., 2016) dataset. It's the best performing model (on Jan 8th 2019) on MS-MARCO Passage re-ranking task. Two models are included: - * ``bert-msmarco-base``: Original BERT base model fine-tuned on + * ``bert-msmarco-nogueira19-base``: Original BERT base model fine-tuned on MS-MARCO. - * ``bert-msmarco-large``: Original BERT large model fine-tuned on + * ``bert-msmarco-nogueira19-large``: Original BERT large model fine-tuned on MS-MARCO. We provide the following BERT classes: @@ -167,9 +167,9 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): # BERT for MS-MARCO 'bert-msmarco-base': - _BERT_MSMARCO_PATH + '1cyUrhs7JaCJTTu-DjFUqP6Bs4f8a6JTX/', + _BERT_MSMARCO_NOGUEIRA19_PATH + '1cyUrhs7JaCJTTu-DjFUqP6Bs4f8a6JTX/', 'bert-msmarco-large': - _BERT_MSMARCO_PATH + '1crlASTMlsihALlkabAQP6JTYIZwC1Wm8/' + _BERT_MSMARCO_NOGUEIRA19_PATH + '1crlASTMlsihALlkabAQP6JTYIZwC1Wm8/' } _MODEL2CKPT = { # Standard BERT @@ -194,8 +194,8 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): 'scibert-basevocab-cased': 'bert_model.ckpt', # BERT for MSMARCO - 'bert-msmarco-base': 'model.ckpt-100000', - 'bert-msmarco-large': 'model.ckpt-100000', + 'bert-msmarco-nogueira19-base': 'model.ckpt-100000', + 'bert-msmarco-nogueira19-large': 'model.ckpt-100000', } @classmethod From 824d947cc4b38306ca4afe675fe9ea1f200d0f83 Mon Sep 17 00:00:00 2001 From: Atif Ahmed Date: Mon, 23 Dec 2019 14:23:46 -0500 Subject: [PATCH 10/12] Changing name --- texar/torch/modules/pretrained/bert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/texar/torch/modules/pretrained/bert.py b/texar/torch/modules/pretrained/bert.py index 1b18a722d..3a9be7563 100644 --- a/texar/torch/modules/pretrained/bert.py +++ b/texar/torch/modules/pretrained/bert.py @@ -166,9 +166,9 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): _SCIBERT_PATH + 'scibert_basevocab_cased.tar.gz', # BERT for MS-MARCO - 'bert-msmarco-base': + 'bert-msmarco-nogueira19-base': _BERT_MSMARCO_NOGUEIRA19_PATH + '1cyUrhs7JaCJTTu-DjFUqP6Bs4f8a6JTX/', - 'bert-msmarco-large': + 'bert-msmarco-nogueira19-large': _BERT_MSMARCO_NOGUEIRA19_PATH + '1crlASTMlsihALlkabAQP6JTYIZwC1Wm8/' } _MODEL2CKPT = { From 7d56607f3cd3e90c4207fa6ebc77c64c73c57156 Mon Sep 17 00:00:00 2001 From: Atif Ahmed Date: Mon, 23 Dec 2019 15:56:24 -0500 Subject: [PATCH 11/12] Avoiding duplicate downloads --- texar/torch/modules/classifiers/bert_classifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/texar/torch/modules/classifiers/bert_classifier.py b/texar/torch/modules/classifiers/bert_classifier.py index a0dc2f8e8..9cc0a1b6f 100644 --- a/texar/torch/modules/classifiers/bert_classifier.py +++ b/texar/torch/modules/classifiers/bert_classifier.py @@ -76,9 +76,10 @@ def __init__(self, # Create the underlying encoder encoder_hparams = dict_fetch(hparams, self._ENCODER_CLASS.default_hparams()) + encoder_hparams['pretrained_model_name'] = None self._encoder = self._ENCODER_CLASS( - pretrained_model_name=pretrained_model_name, + pretrained_model_name=None, cache_dir=cache_dir, hparams=encoder_hparams) From 6e3920a2a86c0f340c52289edf849789d6a95e2c Mon Sep 17 00:00:00 2001 From: Atif Ahmed Date: Thu, 26 Dec 2019 13:58:22 -0500 Subject: [PATCH 12/12] lint --- texar/torch/modules/pretrained/bert.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/texar/torch/modules/pretrained/bert.py b/texar/torch/modules/pretrained/bert.py index 3a9be7563..07db3da44 100644 --- a/texar/torch/modules/pretrained/bert.py +++ b/texar/torch/modules/pretrained/bert.py @@ -103,10 +103,10 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): (`Nguyen et al`., 2016) dataset. It's the best performing model (on Jan 8th 2019) on MS-MARCO Passage re-ranking task. Two models are included: - * ``bert-msmarco-nogueira19-base``: Original BERT base model fine-tuned on - MS-MARCO. - * ``bert-msmarco-nogueira19-large``: Original BERT large model fine-tuned on - MS-MARCO. + * ``bert-msmarco-nogueira19-base``: Original BERT base model fine-tuned + on MS-MARCO. + * ``bert-msmarco-nogueira19-large``: Original BERT large model + fine-tuned on MS-MARCO. We provide the following BERT classes: @@ -167,7 +167,7 @@ class PretrainedBERTMixin(PretrainedMixin, ABC): # BERT for MS-MARCO 'bert-msmarco-nogueira19-base': - _BERT_MSMARCO_NOGUEIRA19_PATH + '1cyUrhs7JaCJTTu-DjFUqP6Bs4f8a6JTX/', + _BERT_MSMARCO_NOGUEIRA19_PATH + '1cyUrhs7JaCJTTu-DjFUqP6Bs4f8a6JTX', 'bert-msmarco-nogueira19-large': _BERT_MSMARCO_NOGUEIRA19_PATH + '1crlASTMlsihALlkabAQP6JTYIZwC1Wm8/' }