diff --git a/.codecov.yml b/.codecov.yml
index fc2f9ad47..ef5e6772a 100644
--- a/.codecov.yml
+++ b/.codecov.yml
@@ -10,7 +10,7 @@ coverage:
     project:
       default:
         threshold: 1%
-    patch: yes
+    patch: off
     changes: no
 
 parsers:
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 2ec228522..697e1ea35 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,6 +1,6 @@
 sphinx
 sphinx-rtd-theme >= 0.2.4
-Pygments >= 2.1.1
+Pygments >= 2.1.1, < 2.5.1
 funcsigs
 recommonmark
 mypy_extensions
diff --git a/texar/torch/data/tokenizers/bert_tokenizer.py b/texar/torch/data/tokenizers/bert_tokenizer.py
index 231555289..1d0f244ff 100644
--- a/texar/torch/data/tokenizers/bert_tokenizer.py
+++ b/texar/torch/data/tokenizers/bert_tokenizer.py
@@ -76,6 +76,30 @@ class BERTTokenizer(PretrainedBERTMixin, TokenizerBase):
         'scibert-basevocab-cased': 512,
     }
     _VOCAB_FILE_NAMES = {'vocab_file': 'vocab.txt'}
+    _VOCAB_FILE_MAP = {
+        'vocab_file': {
+            # Standard BERT
+            'bert-base-uncased': 'vocab.txt',
+            'bert-large-uncased': 'vocab.txt',
+            'bert-base-cased': 'vocab.txt',
+            'bert-large-cased': 'vocab.txt',
+            'bert-base-multilingual-uncased': 'vocab.txt',
+            'bert-base-multilingual-cased': 'vocab.txt',
+            'bert-base-chinese': 'vocab.txt',
+
+            # BioBERT
+            'biobert-v1.0-pmc': 'vocab.txt',
+            'biobert-v1.0-pubmed-pmc': 'vocab.txt',
+            'biobert-v1.0-pubmed': 'vocab.txt',
+            'biobert-v1.1-pubmed': 'vocab.txt',
+
+            # SciBERT
+            'scibert-scivocab-uncased': 'vocab.txt',
+            'scibert-scivocab-cased': 'vocab.txt',
+            'scibert-basevocab-uncased': 'vocab.txt',
+            'scibert-basevocab-cased': 'vocab.txt',
+        }
+    }
 
     def __init__(self,
                  pretrained_model_name: Optional[str] = None,
@@ -93,8 +117,10 @@ def __init__(self,
         }
 
         if self.pretrained_model_dir is not None:
+            assert self.pretrained_model_name is not None
             vocab_file = os.path.join(self.pretrained_model_dir,
-                                      self._VOCAB_FILE_NAMES['vocab_file'])
+                                      self._VOCAB_FILE_MAP['vocab_file']
+                                      [self.pretrained_model_name])
             assert self.pretrained_model_name is not None
             if self._MAX_INPUT_SIZE.get(self.pretrained_model_name):
                 self.max_len = self._MAX_INPUT_SIZE[self.pretrained_model_name]
diff --git a/texar/torch/data/tokenizers/gpt2_tokenizer.py b/texar/torch/data/tokenizers/gpt2_tokenizer.py
index d7bfb7eb6..34de84365 100644
--- a/texar/torch/data/tokenizers/gpt2_tokenizer.py
+++ b/texar/torch/data/tokenizers/gpt2_tokenizer.py
@@ -70,6 +70,24 @@ class GPT2Tokenizer(TokenizerBase, PretrainedGPT2Mixin):
         'vocab_file': 'encoder.json',
         'merges_file': 'vocab.bpe',
     }
+    _VOCAB_FILE_MAP = {
+        'vocab_file': {
+            'gpt2-small': 'encoder.json',
+            'gpt2-medium': 'encoder.json',
+            'gpt2-large': 'encoder.json',
+            'gpt2-xl': 'encoder.json',
+            '117M': 'encoder.json',
+            '345M': 'encoder.json',
+        },
+        'merges_file': {
+            'gpt2-small': 'vocab.bpe',
+            'gpt2-medium': 'vocab.bpe',
+            'gpt2-large': 'vocab.bpe',
+            'gpt2-xl': 'vocab.bpe',
+            '117M': 'vocab.bpe',
+            '345M': 'vocab.bpe',
+        },
+    }
 
     def __init__(self,
                  pretrained_model_name: Optional[str] = None,
@@ -84,10 +102,13 @@ def __init__(self,
         }
 
         if self.pretrained_model_dir is not None:
+            assert self.pretrained_model_name is not None
             vocab_file = os.path.join(self.pretrained_model_dir,
-                                      self._VOCAB_FILE_NAMES['vocab_file'])
+                                      self._VOCAB_FILE_MAP['vocab_file']
+                                      [self.pretrained_model_name])
             merges_file = os.path.join(self.pretrained_model_dir,
-                                       self._VOCAB_FILE_NAMES['merges_file'])
+                                       self._VOCAB_FILE_MAP['merges_file']
+                                       [self.pretrained_model_name])
             assert pretrained_model_name is not None
             if self._MAX_INPUT_SIZE.get(pretrained_model_name):
                 self.max_len = self._MAX_INPUT_SIZE[pretrained_model_name]
@@ -119,9 +140,8 @@ def __init__(self,
 
         # Should haved added re.IGNORECASE so BPE merges can happen for
         # capitalized versions of contractions
-        self.pat = re.compile(
-            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?
-            [^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| """
+                              r"""?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
     def _map_text_to_token(self, text: str) -> List[str]:  # type: ignore
         r"""Tokenize a string. """
@@ -301,6 +321,7 @@ def default_hparams() -> Dict[str, Any]:
                 "unk_token": "<|endoftext|>",
                 "pad_token": "<|endoftext|>",
                 "errors": "replace",
+                "name": "gpt2_tokenizer",
             }
 
         Here:
@@ -332,6 +353,9 @@ def default_hparams() -> Dict[str, Any]:
             `"errors"`: str
                 Response when mapping tokens to text fails.
                 The possible values are `ignore`, `replace`, and `strict`.
+
+            `"name"`: str
+                Name of the tokenizer.
         """
         return {
             'pretrained_model_name': '117M',
@@ -343,6 +367,7 @@ def default_hparams() -> Dict[str, Any]:
             'unk_token': '<|endoftext|>',
             'pad_token': '<|endoftext|>',
             'errors': 'replace',
+            'name': 'gpt2_tokenizer',
             '@no_typecheck': ['pretrained_model_name'],
         }
 
diff --git a/texar/torch/data/tokenizers/roberta_tokenizer.py b/texar/torch/data/tokenizers/roberta_tokenizer.py
index 67cd44f4d..4dfec8805 100644
--- a/texar/torch/data/tokenizers/roberta_tokenizer.py
+++ b/texar/torch/data/tokenizers/roberta_tokenizer.py
@@ -24,8 +24,7 @@
     'RoBERTaTokenizer',
 ]
 
-_GPT2_PATH = "https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/"
-_CHECKPOINT_FILES = ["encoder.json", "vocab.bpe"]
+_ROBERTA_PATH = "https://s3.amazonaws.com/models.huggingface.co/bert/"
 
 
 class RoBERTaTokenizer(GPT2Tokenizer):
@@ -48,13 +47,29 @@ class RoBERTaTokenizer(GPT2Tokenizer):
     """
 
     _MODEL2URL = {
-        'roberta-base': [_GPT2_PATH + f"{file}" for file in _CHECKPOINT_FILES],
-        'roberta-large': [_GPT2_PATH + f"{file}" for file in _CHECKPOINT_FILES],
+        'roberta-base': [
+            _ROBERTA_PATH + 'roberta-base-vocab.json',
+            _ROBERTA_PATH + 'roberta-base-merges.txt',
+        ],
+        'roberta-large': [
+            _ROBERTA_PATH + 'roberta-large-vocab.json',
+            _ROBERTA_PATH + 'roberta-large-merges.txt',
+        ],
     }
     _MAX_INPUT_SIZE = {
         'roberta-base': 512,
        'roberta-large': 512,
     }
+    _VOCAB_FILE_MAP = {
+        'vocab_file': {
+            'roberta-base': 'roberta-base-vocab.json',
+            'roberta-large': 'roberta-large-vocab.json',
+        },
+        'merges_file': {
+            'roberta-base': 'roberta-base-merges.txt',
+            'roberta-large': 'roberta-large-merges.txt',
+        },
+    }
 
     def encode_text(self,  # type: ignore
                     text_a: str,
@@ -153,6 +168,7 @@ def default_hparams() -> Dict[str, Any]:
                 "pad_token": "",
                 "mask_token": "",
                 "errors": "replace",
+                "name": "roberta_tokenizer",
             }
 
         Here:
@@ -193,6 +209,9 @@ def default_hparams() -> Dict[str, Any]:
             `"errors"`: str
                 Response when decoding fails.
                 The possible values are `ignore`, `replace`, and `strict`.
+
+            `"name"`: str
+                Name of the tokenizer.
         """
         return {
             'pretrained_model_name': 'roberta-base',
@@ -207,6 +226,7 @@ def default_hparams() -> Dict[str, Any]:
             'pad_token': '',
             'mask_token': '',
             'errors': 'replace',
+            'name': 'roberta_tokenizer',
             '@no_typecheck': ['pretrained_model_name'],
         }
 
diff --git a/texar/torch/data/tokenizers/tokenizer_base.py b/texar/torch/data/tokenizers/tokenizer_base.py
index 5fb524c3c..fbbebf4a8 100644
--- a/texar/torch/data/tokenizers/tokenizer_base.py
+++ b/texar/torch/data/tokenizers/tokenizer_base.py
@@ -53,6 +53,7 @@ class TokenizerBase(ModuleBase):
     _IS_PRETRAINED: bool
     _MAX_INPUT_SIZE: Dict[str, Optional[int]]
     _VOCAB_FILE_NAMES: Dict[str, str]
+    _VOCAB_FILE_MAP: Dict[str, Dict[str, str]]
     _SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token",
                                   "sep_token", "pad_token", "cls_token",
                                   "mask_token", "additional_special_tokens"]
diff --git a/texar/torch/data/tokenizers/xlnet_tokenizer.py b/texar/torch/data/tokenizers/xlnet_tokenizer.py
index abdeed620..b79660ff8 100644
--- a/texar/torch/data/tokenizers/xlnet_tokenizer.py
+++ b/texar/torch/data/tokenizers/xlnet_tokenizer.py
@@ -67,6 +67,12 @@ class XLNetTokenizer(PretrainedXLNetMixin, TokenizerBase):
         'xlnet-large-cased': None,
     }
     _VOCAB_FILE_NAMES = {'vocab_file': 'spiece.model'}
+    _VOCAB_FILE_MAP = {
+        'vocab_file': {
+            'xlnet-base-cased': 'spiece.model',
+            'xlnet-large-cased': 'spiece.model',
+        }
+    }
 
     def __init__(self,
                  pretrained_model_name: Optional[str] = None,
@@ -85,8 +91,10 @@ def __init__(self,
         }
 
         if self.pretrained_model_dir is not None:
+            assert self.pretrained_model_name is not None
             vocab_file = os.path.join(self.pretrained_model_dir,
-                                      self._VOCAB_FILE_NAMES['vocab_file'])
+                                      self._VOCAB_FILE_MAP['vocab_file']
+                                      [self.pretrained_model_name])
             assert pretrained_model_name is not None
             if self._MAX_INPUT_SIZE.get(pretrained_model_name):
                 self.max_len = self._MAX_INPUT_SIZE[pretrained_model_name]
@@ -304,6 +312,7 @@ def default_hparams() -> Dict[str, Any]:
                 "do_lower_case": False,
                 "remove_space": True,
                 "keep_accents": False,
+                "name": "xlnet_tokenizer",
             }
 
         Here:
@@ -349,6 +358,9 @@ def default_hparams() -> Dict[str, Any]:
 
             `"keep_accents"`: bool
                 Whether to keep the accents in the text.
+
+            `"name"`: str
+                Name of the tokenizer.
         """
         return {
             'pretrained_model_name': 'xlnet-base-cased',
@@ -365,6 +377,7 @@ def default_hparams() -> Dict[str, Any]:
             'do_lower_case': False,
             'remove_space': True,
             'keep_accents': False,
+            'name': 'xlnet_tokenizer',
             '@no_typecheck': ['pretrained_model_name'],
         }
 