diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py index eea85b00cd777a..5a6b960dbba852 100644 --- a/src/transformers/models/bart/tokenization_bart.py +++ b/src/transformers/models/bart/tokenization_bart.py @@ -20,18 +20,36 @@ logger = logging.get_logger(__name__) -# vocab and merges same as roberta -vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" -merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" -_all_bart_models = [ - "facebook/bart-base", - "facebook/bart-large", - "facebook/bart-large-mnli", - "facebook/bart-large-cnn", - "facebook/bart-large-xsum", - "yjernite/bart_eli5", - # This is not exhaustive: see https://huggingface.co/models?filter=bart -] +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + +# See all BART models at https://huggingface.co/models?filter=bart +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/vocab.json", + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json", + "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/vocab.json", + "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/vocab.json", + "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/vocab.json", + "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/vocab.json", + }, + "merges_file": { + "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/merges.txt", + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt", + "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/merges.txt", + "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt", + "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/merges.txt", + "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/bart-base": 1024, + "facebook/bart-large": 1024, + "facebook/bart-large-mnli": 1024, + "facebook/bart-large-cnn": 1024, + "facebook/bart-large-xsum": 1024, + "yjernite/bart_eli5": 1024, +} class BartTokenizer(RobertaTokenizer): @@ -42,9 +60,6 @@ class BartTokenizer(RobertaTokenizer): :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization parameters and other methods. 
""" - # merges and vocab same as Roberta - max_model_input_sizes = {m: 1024 for m in _all_bart_models} - pretrained_vocab_files_map = { - "vocab_file": {m: vocab_url for m in _all_bart_models}, - "merges_file": {m: merges_url for m in _all_bart_models}, - } + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py index 83fca126fa08d6..10ba84e7abc151 100644 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ b/src/transformers/models/bart/tokenization_bart_fast.py @@ -21,19 +21,44 @@ logger = logging.get_logger(__name__) -# vocab and merges same as roberta -vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" -merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" -tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json" -_all_bart_models = [ - "facebook/bart-base", - "facebook/bart-large", - "facebook/bart-large-mnli", - "facebook/bart-large-cnn", - "facebook/bart-large-xsum", - "yjernite/bart_eli5", - # This is not exhaustive: see https://huggingface.co/models?filter=bart -] +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} + +# See all BART models at https://huggingface.co/models?filter=bart +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/vocab.json", + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json", + "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/vocab.json", + "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/vocab.json", + "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/vocab.json", + "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/vocab.json", + }, + "merges_file": { + "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/merges.txt", + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt", + "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/merges.txt", + "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt", + "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/merges.txt", + "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/merges.txt", + }, + "tokenizer_file": { + "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/tokenizer.json", + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/tokenizer.json", + "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/tokenizer.json", + "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/tokenizer.json", + "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/tokenizer.json", + "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/bart-base": 1024, + "facebook/bart-large": 1024, + "facebook/bart-large-mnli": 1024, + 
"facebook/bart-large-cnn": 1024, + "facebook/bart-large-xsum": 1024, + "yjernite/bart_eli5": 1024, +} class BartTokenizerFast(RobertaTokenizerFast): @@ -44,11 +69,7 @@ class BartTokenizerFast(RobertaTokenizerFast): superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the initialization parameters and other methods. """ - # merges and vocab same as Roberta - max_model_input_sizes = {m: 1024 for m in _all_bart_models} - pretrained_vocab_files_map = { - "vocab_file": {m: vocab_url for m in _all_bart_models}, - "merges_file": {m: merges_url for m in _all_bart_models}, - "tokenizer_file": {m: tokenizer_url for m in _all_bart_models}, - } + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES slow_tokenizer_class = BartTokenizer diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py index ce672863935502..747a0b8f99fad2 100644 --- a/src/transformers/models/bert_generation/tokenization_bert_generation.py +++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py @@ -29,7 +29,13 @@ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} -tokenizer_url = "https://huggingface.co/google/bert_for_seq_generation_L-24_bbc_encoder/resolve/main/spiece.model" +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "bert_for_seq_generation": "https://huggingface.co/google/bert_for_seq_generation_L-24_bbc_encoder/resolve/main/spiece.model", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"bert_for_seq_generation": 512} class BertGenerationTokenizer(PreTrainedTokenizer): @@ -55,8 +61,8 @@ class BertGenerationTokenizer(PreTrainedTokenizer): """ vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = {"vocab_file": {"bert_for_seq_generation": tokenizer_url}} - max_model_input_sizes = {"bert_for_seq_generation": 512} + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES prefix_tokens: List[int] = [] model_input_names = ["input_ids", "attention_mask"] diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index ea8b435683c48b..b37039ee127ef7 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -29,9 +29,18 @@ VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", - # "tokenizer_config_file": "tokenizer_config.json", + "tokenizer_config_file": "tokenizer_config.json", } -CKPT_3B = "facebook/blenderbot-3B" + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"}, + "merges_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"}, + "tokenizer_config_file": { + "facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json" + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128} class BlenderbotTokenizer(RobertaTokenizer): @@ -45,19 +54,9 @@ class BlenderbotTokenizer(RobertaTokenizer): Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning parameters. 
""" - vocab_files_names = { - "vocab_file": "vocab.json", - "merges_file": "merges.txt", - "tokenizer_config_file": "tokenizer_config.json", - } - pretrained_vocab_files_map = { - "vocab_file": {CKPT_3B: "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"}, - "merges_file": {CKPT_3B: "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"}, - "tokenizer_config_file": { - CKPT_3B: "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json" - }, - } - max_model_input_sizes = {"facebook/blenderbot-3B": 128} + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None): """ diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py index acc2c985a5dc3d..f69e14aa25d3d1 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py @@ -33,6 +33,20 @@ "tokenizer_config_file": "tokenizer_config.json", } +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json" + }, + "merges_file": { + "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt" + }, + "tokenizer_config_file": { + "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer_config.json" + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot_small-90M": 512} + def get_pairs(word): """ @@ -75,23 +89,9 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer): Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer` """ - vocab_files_names = { - "vocab_file": "vocab.json", - "merges_file": "merges.txt", - "tokenizer_config": "tokenizer_config.json", - } - pretrained_vocab_files_map = { - "vocab_file": { - "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json" - }, - "merges_file": { - "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt" - }, - "tokenizer_config_file": { - "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer.json" - }, - } - max_model_input_sizes = {"facebook/blenderbot_small-90M": 512} + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "attention_mask"] def __init__( diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py index 07d9242a90d516..c71d2229e06a18 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py @@ -24,9 +24,23 @@ logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {} +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + "tokenizer_config_file": "tokenizer_config.json", +} -PRETRAINED_VOCAB_FILES_MAP = {} 
+PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json" + }, + "merges_file": { + "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt" + }, + "tokenizer_config_file": { + "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer_config.json" + }, +} PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "facebook/blenderbot_small-90M": 512, diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py index 6e866ba63885f9..8901ee9a32ad50 100644 --- a/src/transformers/models/camembert/tokenization_camembert.py +++ b/src/transformers/models/camembert/tokenization_camembert.py @@ -39,13 +39,6 @@ "camembert-base": 512, } -SHARED_MODEL_IDENTIFIERS = [ - # Load with - # `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")` - "Musixmatch/umberto-commoncrawl-cased-v1", - "Musixmatch/umberto-wikipedia-uncased-v1", -] - SPIECE_UNDERLINE = "▁" diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py index 87019e72537475..a93af73fd23fd0 100644 --- a/src/transformers/models/camembert/tokenization_camembert_fast.py +++ b/src/transformers/models/camembert/tokenization_camembert_fast.py @@ -48,13 +48,6 @@ "camembert-base": 512, } -SHARED_MODEL_IDENTIFIERS = [ - # Load with - # `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")` - "Musixmatch/umberto-commoncrawl-cased-v1", - "Musixmatch/umberto-wikipedia-uncased-v1", -] - SPIECE_UNDERLINE = "▁" diff --git a/src/transformers/models/distilbert/tokenization_distilbert.py b/src/transformers/models/distilbert/tokenization_distilbert.py index 93279f0635e072..50dc80bdf46cc4 100644 --- a/src/transformers/models/distilbert/tokenization_distilbert.py +++ b/src/transformers/models/distilbert/tokenization_distilbert.py @@ -24,12 +24,12 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", - "distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", - "distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt", - "distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt", + "distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt", + "distilbert-base-uncased-distilled-squad": "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt", + "distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt", + "distilbert-base-cased-distilled-squad": "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt", "distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt", - "distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt", + "distilbert-base-multilingual-cased": "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt", } } diff --git a/src/transformers/models/distilbert/tokenization_distilbert_fast.py b/src/transformers/models/distilbert/tokenization_distilbert_fast.py index 
d4b953b6332302..4007d4e8714fda 100644 --- a/src/transformers/models/distilbert/tokenization_distilbert_fast.py +++ b/src/transformers/models/distilbert/tokenization_distilbert_fast.py @@ -25,20 +25,20 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", - "distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", - "distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt", - "distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt", + "distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt", + "distilbert-base-uncased-distilled-squad": "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt", + "distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt", + "distilbert-base-cased-distilled-squad": "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt", "distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt", - "distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt", + "distilbert-base-multilingual-cased": "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt", }, "tokenizer_file": { - "distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", - "distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/tokenizer.json", - "distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json", - "distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/tokenizer.json", + "distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json", + "distilbert-base-uncased-distilled-squad": "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/tokenizer.json", + "distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/tokenizer.json", + "distilbert-base-cased-distilled-squad": "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/tokenizer.json", "distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/tokenizer.json", - "distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json", + "distilbert-base-multilingual-cased": "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/tokenizer.json", }, } diff --git a/src/transformers/models/dpr/tokenization_dpr.py b/src/transformers/models/dpr/tokenization_dpr.py index 705fd064a869a9..cedfe43d21e792 100644 --- a/src/transformers/models/dpr/tokenization_dpr.py +++ b/src/transformers/models/dpr/tokenization_dpr.py @@ -30,32 +30,32 @@ CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", - "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/vocab.txt", + "facebook/dpr-ctx_encoder-multiset-base": 
"https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/vocab.txt", }, "tokenizer_file": { - "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", - "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", + "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/tokenizer.json", + "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/tokenizer.json", }, } QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", - "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/vocab.txt", + "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/vocab.txt", }, "tokenizer_file": { - "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", - "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", + "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/tokenizer.json", + "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/tokenizer.json", }, } READER_PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "facebook/dpr-reader-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", - "facebook/dpr-reader-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "facebook/dpr-reader-single-nq-base": "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/vocab.txt", + "facebook/dpr-reader-multiset-base": "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/vocab.txt", }, "tokenizer_file": { - "facebook/dpr-reader-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", - "facebook/dpr-reader-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", + "facebook/dpr-reader-single-nq-base": "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/tokenizer.json", + "facebook/dpr-reader-multiset-base": "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/tokenizer.json", }, } diff --git a/src/transformers/models/dpr/tokenization_dpr_fast.py b/src/transformers/models/dpr/tokenization_dpr_fast.py index 12a990041a3ff7..90ab9c3f7403d4 100644 --- a/src/transformers/models/dpr/tokenization_dpr_fast.py +++ b/src/transformers/models/dpr/tokenization_dpr_fast.py @@ -31,32 +31,32 @@ CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", - "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/vocab.txt", + 
"facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/vocab.txt", }, "tokenizer_file": { - "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", - "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", + "facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/tokenizer.json", + "facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/tokenizer.json", }, } QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", - "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/vocab.txt", + "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/vocab.txt", }, "tokenizer_file": { - "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", - "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", + "facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/tokenizer.json", + "facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/tokenizer.json", }, } READER_PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "facebook/dpr-reader-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", - "facebook/dpr-reader-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "facebook/dpr-reader-single-nq-base": "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/vocab.txt", + "facebook/dpr-reader-multiset-base": "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/vocab.txt", }, "tokenizer_file": { - "facebook/dpr-reader-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", - "facebook/dpr-reader-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", + "facebook/dpr-reader-single-nq-base": "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/tokenizer.json", + "facebook/dpr-reader-multiset-base": "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/tokenizer.json", }, } diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py index ab8606be0439b2..30d5a385b8b45b 100644 --- a/src/transformers/models/fsmt/tokenization_fsmt.py +++ b/src/transformers/models/fsmt/tokenization_fsmt.py @@ -36,9 +36,13 @@ } PRETRAINED_VOCAB_FILES_MAP = { - "src_vocab_file": {"stas/tiny-wmt19-en-de": "https://cdn.huggingface.co/stas/tiny-wmt19-en-de/vocab-src.json"}, - "tgt_vocab_file": {"stas/tiny-wmt19-en-de": "https://cdn.huggingface.co/stas/tiny-wmt19-en-de/vocab-tgt.json"}, - "merges_file": {"stas/tiny-wmt19-en-de": "https://cdn.huggingface.co/stas/tiny-wmt19-en-de/merges.txt"}, + "src_vocab_file": { + "stas/tiny-wmt19-en-de": 
"https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/vocab-src.json" + }, + "tgt_vocab_file": { + "stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/vocab-tgt.json" + }, + "merges_file": {"stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/merges.txt"}, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"stas/tiny-wmt19-en-de": 1024} diff --git a/src/transformers/models/herbert/tokenization_herbert_fast.py b/src/transformers/models/herbert/tokenization_herbert_fast.py index e98f5ff38ac52c..8beefb98a1a556 100644 --- a/src/transformers/models/herbert/tokenization_herbert_fast.py +++ b/src/transformers/models/herbert/tokenization_herbert_fast.py @@ -17,12 +17,7 @@ from ...tokenization_utils_fast import PreTrainedTokenizerFast from ...utils import logging -from .tokenization_herbert import ( - PRETRAINED_INIT_CONFIGURATION, - PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES, - PRETRAINED_VOCAB_FILES_MAP, - HerbertTokenizer, -) +from .tokenization_herbert import HerbertTokenizer logger = logging.get_logger(__name__) @@ -32,6 +27,14 @@ "merges_file": "merges.txt", } +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/vocab.json"}, + "merges_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/merges.txt"}, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514} +PRETRAINED_INIT_CONFIGURATION = {} + class HerbertTokenizerFast(PreTrainedTokenizerFast): """ diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py index 1d5e2eeaa492c8..6a961c77479c14 100644 --- a/src/transformers/models/layoutlm/tokenization_layoutlm.py +++ b/src/transformers/models/layoutlm/tokenization_layoutlm.py @@ -25,8 +25,8 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", - "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", + "microsoft/layoutlm-base-uncased": "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/vocab.txt", + "microsoft/layoutlm-large-uncased": "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/vocab.txt", } } diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py index 00027ce11ed147..533645693e939b 100644 --- a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py +++ b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py @@ -26,12 +26,12 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", - "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", + "microsoft/layoutlm-base-uncased": "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/vocab.txt", + "microsoft/layoutlm-large-uncased": "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/vocab.txt", }, "tokenizer_file": { - "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", - "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/tokenizer.json", + "microsoft/layoutlm-base-uncased": 
"https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/tokenizer.json", + "microsoft/layoutlm-large-uncased": "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/tokenizer.json", }, } diff --git a/src/transformers/models/longformer/tokenization_longformer.py b/src/transformers/models/longformer/tokenization_longformer.py index 4aa9da74f54319..bca7b9bc8f07c6 100644 --- a/src/transformers/models/longformer/tokenization_longformer.py +++ b/src/transformers/models/longformer/tokenization_longformer.py @@ -20,17 +20,24 @@ logger = logging.get_logger(__name__) -# vocab and merges same as roberta -vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" -merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" -_all_longformer_models = [ - "allenai/longformer-base-4096", - "allenai/longformer-large-4096", - "allenai/longformer-large-4096-finetuned-triviaqa", - "allenai/longformer-base-4096-extra.pos.embd.only", - "allenai/longformer-large-4096-extra.pos.embd.only", -] - +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merge_file": "merges.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/vocab.json", + "allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/vocab.json", + "allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/vocab.json", + "allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/vocab.json", + "allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/vocab.json", + }, + "merge_file": { + "allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/merges.txt", + "allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/merges.txt", + "allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/merges.txt", + "allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/merges.txt", + "allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/merges.txt", + }, +} PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "allenai/longformer-base-4096": 4096, @@ -48,9 +55,6 @@ class LongformerTokenizer(RobertaTokenizer): :class:`~transformers.LongformerTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to the superclass for usage examples and documentation concerning parameters. 
""" - # merges and vocab same as Roberta + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_vocab_files_map = { - "vocab_file": {m: vocab_url for m in _all_longformer_models}, - "merges_file": {m: merges_url for m in _all_longformer_models}, - } diff --git a/src/transformers/models/longformer/tokenization_longformer_fast.py b/src/transformers/models/longformer/tokenization_longformer_fast.py index 2dea891246bc25..a25d17db7d0685 100644 --- a/src/transformers/models/longformer/tokenization_longformer_fast.py +++ b/src/transformers/models/longformer/tokenization_longformer_fast.py @@ -21,18 +21,31 @@ logger = logging.get_logger(__name__) -# vocab and merges same as roberta -vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" -merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" -tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json" -_all_longformer_models = [ - "allenai/longformer-base-4096", - "allenai/longformer-large-4096", - "allenai/longformer-large-4096-finetuned-triviaqa", - "allenai/longformer-base-4096-extra.pos.embd.only", - "allenai/longformer-large-4096-extra.pos.embd.only", -] +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merge_file": "merges.txt", "tokenizer_file": "tokenizer.json"} +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/vocab.json", + "allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/vocab.json", + "allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/vocab.json", + "allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/vocab.json", + "allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/vocab.json", + }, + "merge_file": { + "allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/merges.txt", + "allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/merges.txt", + "allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/merges.txt", + "allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/merges.txt", + "allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/merges.txt", + }, + "tokenizer_file": { + "allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/tokenizer.json", + "allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/tokenizer.json", + "allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/tokenizer.json", + "allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/tokenizer.json", + "allenai/longformer-large-4096-extra.pos.embd.only": 
"https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/tokenizer.json", + }, +} PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "allenai/longformer-base-4096": 4096, @@ -51,10 +64,7 @@ class LongformerTokenizerFast(RobertaTokenizerFast): to the superclass for usage examples and documentation concerning parameters. """ # merges and vocab same as Roberta + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_vocab_files_map = { - "vocab_file": {m: vocab_url for m in _all_longformer_models}, - "merges_file": {m: merges_url for m in _all_longformer_models}, - "tokenizer_file": {m: tokenizer_url for m in _all_longformer_models}, - } slow_tokenizer_class = LongformerTokenizer diff --git a/src/transformers/models/lxmert/tokenization_lxmert.py b/src/transformers/models/lxmert/tokenization_lxmert.py index 159e3c1b724518..75f55e5607c93d 100644 --- a/src/transformers/models/lxmert/tokenization_lxmert.py +++ b/src/transformers/models/lxmert/tokenization_lxmert.py @@ -16,33 +16,18 @@ from ..bert.tokenization_bert import BertTokenizer -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to file names for serializing Tokenizer instances -#################################################### VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to pretrained vocabulary URL for all the model ids. -#################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/vocab.txt", } } -#################################################### -# Mapping from model ids to max length of inputs -#################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "unc-nlp/lxmert-base-uncased": 512, } -#################################################### -# Mapping from model ids to a dictionary of additional -# keyword arguments for Tokenizer `__init__`. -# To be used for checkpoint specific configurations. -#################################################### + PRETRAINED_INIT_CONFIGURATION = { "unc-nlp/lxmert-base-uncased": {"do_lower_case": True}, } diff --git a/src/transformers/models/lxmert/tokenization_lxmert_fast.py b/src/transformers/models/lxmert/tokenization_lxmert_fast.py index d2bb378544304b..9f179fb319d69b 100644 --- a/src/transformers/models/lxmert/tokenization_lxmert_fast.py +++ b/src/transformers/models/lxmert/tokenization_lxmert_fast.py @@ -17,36 +17,21 @@ from .tokenization_lxmert import LxmertTokenizer -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to file names for serializing Tokenizer instances -#################################################### VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to pretrained vocabulary URL for all the model ids. 
-#################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/vocab.txt", }, "tokenizer_file": { - "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", + "unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/tokenizer.json", }, } -#################################################### -# Mapping from model ids to max length of inputs -#################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "unc-nlp/lxmert-base-uncased": 512, } -#################################################### -# Mapping from model ids to a dictionary of additional -# keyword arguments for Tokenizer `__init__`. -# To be used for checkpoint specific configurations. -#################################################### + PRETRAINED_INIT_CONFIGURATION = { "unc-nlp/lxmert-base-uncased": {"do_lower_case": True}, } diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py index cd449fa84a21c3..cbd8a0aa0d8773 100644 --- a/src/transformers/models/m2m_100/tokenization_m2m_100.py +++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py @@ -37,17 +37,21 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/vocab.json", + "facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/vocab.json", }, "spm_file": { "facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/sentencepiece.bpe.model", + "facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/sentencepiece.bpe.model", }, "tokenizer_config_file": { "facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/tokenizer_config.json", + "facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/tokenizer_config.json", }, } -ALL_M2M100_MODELS = ["facebook/m2m100_418M", "facebook/m2m100_1.2B"] -SPM_URL = "https://huggingface.co/facebook/m2m100_418M/resolve/main/sentence.bpe.model" +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/m2m100_418M": 1024, +} # fmt: off FAIRSEQ_LANGUAGE_CODES = ["af", "am", "ar", "ast", "az", "ba", "be", "bg", "bn", "br", "bs", "ca", "ceb", "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gu", "ha", "he", "hi", "hr", "ht", "hu", "hy", "id", "ig", "ilo", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "lb", "lg", "ln", "lo", "lt", "lv", "mg", "mk", "ml", "mn", "mr", "ms", "my", "ne", "nl", "no", "ns", "oc", "or", "pa", "pl", "ps", "pt", "ro", "ru", "sd", "si", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv", "sw", "ta", "th", "tl", "tn", "tr", "uk", "ur", "uz", "vi", "wo", "xh", "yi", "yo", "zh", "zu"] @@ -96,7 +100,7 @@ class M2M100Tokenizer(PreTrainedTokenizer): """ vocab_files_names = VOCAB_FILES_NAMES - max_model_input_sizes = {m: 1024 for m in ALL_M2M100_MODELS} + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP model_input_names = ["input_ids", "attention_mask"] diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index 
dadc9e2c644e5a..613b385b7799b8 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -25,7 +25,7 @@ from ...tokenization_utils import PreTrainedTokenizer -vocab_files_names = { +VOCAB_FILES_NAMES = { "source_spm": "source.spm", "target_spm": "target.spm", "vocab": "vocab.json", @@ -33,11 +33,17 @@ } PRETRAINED_VOCAB_FILES_MAP = { - "source_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/source.spm"}, - "target_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/target.spm"}, - "vocab": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/vocab.json"}, + "source_spm": { + "Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/source.spm" + }, + "target_spm": { + "Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/target.spm" + }, + "vocab": { + "Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/vocab.json" + }, "tokenizer_config_file": { - "Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/tokenizer_config.json" + "Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/tokenizer_config.json" }, } @@ -91,7 +97,7 @@ class MarianTokenizer(PreTrainedTokenizer): >>> outputs = model(**inputs) should work """ - vocab_files_names = vocab_files_names + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py index 752ff3effed755..c256132d7e73d0 100644 --- a/src/transformers/models/mbart/tokenization_mbart.py +++ b/src/transformers/models/mbart/tokenization_mbart.py @@ -23,8 +23,20 @@ logger = logging.get_logger(__name__) -_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"] -SPM_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentence.bpe.model" + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentencepiece.bpe.model", + "facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/sentencepiece.bpe.model", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/mbart-large-en-ro": 1024, + "facebook/mbart-large-cc25": 1024, +} FAIRSEQ_LANGUAGE_CODES = [ "ar_AR", @@ -78,9 +90,9 @@ class MBartTokenizer(XLMRobertaTokenizer): >>> inputs["labels"] = labels["input_ids"] """ - vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"} - max_model_input_sizes = {m: 1024 for m in _all_mbart_models} - pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}} + vocab_files_names = VOCAB_FILES_NAMES + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP prefix_tokens: List[int] = [] suffix_tokens: List[int] = [] diff --git a/src/transformers/models/mbart/tokenization_mbart50.py b/src/transformers/models/mbart/tokenization_mbart50.py index e6d38a382185d4..be94eaa80abda9 100644 --- 
a/src/transformers/models/mbart/tokenization_mbart50.py +++ b/src/transformers/models/mbart/tokenization_mbart50.py @@ -30,8 +30,15 @@ VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} -_all_mbart50_models = ["facebook/mbart-large-50-one-to-many-mmt"] -SPM_URL = "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model" +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/mbart-large-50-one-to-many-mmt": "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/mbart-large-50-one-to-many-mmt": 1024, +} # fmt: off FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"] @@ -83,8 +90,8 @@ class MBart50Tokenizer(PreTrainedTokenizer): """ vocab_files_names = VOCAB_FILES_NAMES - max_model_input_sizes = {m: 1024 for m in _all_mbart50_models} - pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart50_models}} + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP model_input_names = ["input_ids", "attention_mask"] prefix_tokens: List[int] = [] diff --git a/src/transformers/models/mbart/tokenization_mbart50_fast.py b/src/transformers/models/mbart/tokenization_mbart50_fast.py index 11b21f139e7bea..0308991de6e1ab 100644 --- a/src/transformers/models/mbart/tokenization_mbart50_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart50_fast.py @@ -36,9 +36,18 @@ VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} -_all_mbart50_models = ["facebook/mbart-large-50-one-to-many-mmt"] -SPM_URL = "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model" -tokenizer_URL = "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/tokenizer.json" +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/mbart-large-50-one-to-many-mmt": "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model", + }, + "tokenizer_file": { + "facebook/mbart-large-50-one-to-many-mmt": "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/mbart-large-50-one-to-many-mmt": 1024, +} # fmt: off FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"] @@ -91,8 +100,8 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast): """ vocab_files_names = VOCAB_FILES_NAMES - max_model_input_sizes = {m: 1024 for m in _all_mbart50_models} - pretrained_vocab_files_map = 
{"vocab_file": {m: SPM_URL for m in _all_mbart50_models}} + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP model_input_names = ["input_ids", "attention_mask"] slow_tokenizer_class = MBart50Tokenizer diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py index a449895a068a91..e69021831506fc 100644 --- a/src/transformers/models/mbart/tokenization_mbart_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart_fast.py @@ -32,9 +32,24 @@ logger = logging.get_logger(__name__) -_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"] -SPM_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentence.bpe.model" -tokenizer_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/tokenizer.json" + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentencepiece.bpe.model", + "facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/sentencepiece.bpe.model", + }, + "tokenizer_file": { + "facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/tokenizer.json", + "facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/mbart-large-en-ro": 1024, + "facebook/mbart-large-cc25": 1024, +} FAIRSEQ_LANGUAGE_CODES = [ "ar_AR", @@ -89,9 +104,9 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast): >>> inputs["labels"] = labels["input_ids"] """ - vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"} - max_model_input_sizes = {m: 1024 for m in _all_mbart_models} - pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}} + vocab_files_names = VOCAB_FILES_NAMES + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP slow_tokenizer_class = MBartTokenizer prefix_tokens: List[int] = [] diff --git a/src/transformers/models/reformer/tokenization_reformer.py b/src/transformers/models/reformer/tokenization_reformer.py index 3c6ad947036845..f2000d69d713dc 100644 --- a/src/transformers/models/reformer/tokenization_reformer.py +++ b/src/transformers/models/reformer/tokenization_reformer.py @@ -27,28 +27,17 @@ logger = logging.get_logger(__name__) -SPIECE_UNDERLINE = "▁" +SPIECE_UNDERLINE = "▁" -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to file names for serializing Tokenizer instances -#################################################### VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to pretrained vocabulary URL for all the model ids. 
-#################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "google/reformer-crime-and-punishment": "https://huggingface.co/google/reformer-crime-and-punishment/resolve/main/spiece.model" } } -#################################################### -# Mapping from model ids to max length of inputs -#################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "google/reformer-crime-and-punishment": 524288, } diff --git a/src/transformers/models/reformer/tokenization_reformer_fast.py b/src/transformers/models/reformer/tokenization_reformer_fast.py index f8ab110a2fed1d..d8050ec64225bb 100644 --- a/src/transformers/models/reformer/tokenization_reformer_fast.py +++ b/src/transformers/models/reformer/tokenization_reformer_fast.py @@ -32,19 +32,11 @@ logger = logging.get_logger(__name__) -SPIECE_UNDERLINE = "▁" +SPIECE_UNDERLINE = "▁" -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to file names for serializing Tokenizer instances -#################################################### VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to pretrained vocabulary URL for all the model ids. -#################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "google/reformer-crime-and-punishment": "https://huggingface.co/google/reformer-crime-and-punishment/resolve/main/spiece.model" @@ -54,9 +46,6 @@ }, } -#################################################### -# Mapping from model ids to max length of inputs -#################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "google/reformer-crime-and-punishment": 524288, } diff --git a/src/transformers/models/retribert/tokenization_retribert.py b/src/transformers/models/retribert/tokenization_retribert.py index 4247edbba16a2c..085aafcd36249d 100644 --- a/src/transformers/models/retribert/tokenization_retribert.py +++ b/src/transformers/models/retribert/tokenization_retribert.py @@ -24,7 +24,7 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "yjernite/retribert-base-uncased": "https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/vocab.txt", } } diff --git a/src/transformers/models/retribert/tokenization_retribert_fast.py b/src/transformers/models/retribert/tokenization_retribert_fast.py index e6e7c001b9674f..91f299b70b11e6 100644 --- a/src/transformers/models/retribert/tokenization_retribert_fast.py +++ b/src/transformers/models/retribert/tokenization_retribert_fast.py @@ -25,10 +25,10 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "yjernite/retribert-base-uncased": "https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/vocab.txt", }, "tokenizer_file": { - "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", + "yjernite/retribert-base-uncased": "https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/tokenizer.json", }, } diff --git a/src/transformers/models/roberta/tokenization_roberta.py b/src/transformers/models/roberta/tokenization_roberta.py index f94460fc26fc08..9a037d1d1551a1 
--- a/src/transformers/models/roberta/tokenization_roberta.py
+++ b/src/transformers/models/roberta/tokenization_roberta.py
@@ -34,16 +34,16 @@
         "roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
         "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
         "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
-        "roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
-        "roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
+        "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json",
+        "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json",
     },
     "merges_file": {
         "roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
         "roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
         "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
         "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
-        "roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
-        "roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
+        "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt",
+        "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt",
     },
 }

diff --git a/src/transformers/models/roberta/tokenization_roberta_fast.py b/src/transformers/models/roberta/tokenization_roberta_fast.py
index fe26385c485d78..c450be4a29f0e2 100644
--- a/src/transformers/models/roberta/tokenization_roberta_fast.py
+++ b/src/transformers/models/roberta/tokenization_roberta_fast.py
@@ -32,24 +32,24 @@
         "roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
         "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
         "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
-        "roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
-        "roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
+        "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json",
+        "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json",
     },
     "merges_file": {
         "roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
         "roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
         "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
         "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
-        "roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
-        "roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
+        "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt",
+        "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt",
     },
     "tokenizer_file": {
         "roberta-base": "https://huggingface.co/roberta-base/resolve/main/tokenizer.json",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/tokenizer.json", "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/tokenizer.json", "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/tokenizer.json", - "roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/tokenizer.json", - "roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/tokenizer.json", + "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/tokenizer.json", + "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/tokenizer.json", }, } diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index 0619bdfad12a94..07c2fdf47b99af 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -29,16 +29,8 @@ logger = logging.get_logger(__name__) -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to file names for serializing Tokenizer instances -#################################################### VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to pretrained vocabulary URL for all the model ids. -#################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "t5-small": "https://huggingface.co/t5-small/resolve/main/spiece.model", @@ -49,9 +41,6 @@ } } -#################################################### -# Mapping from model ids to max length of inputs -#################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "t5-small": 512, "t5-base": 512, diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py index a8a9fcb2f1b5f9..10986695df68e4 100644 --- a/src/transformers/models/t5/tokenization_t5_fast.py +++ b/src/transformers/models/t5/tokenization_t5_fast.py @@ -32,16 +32,8 @@ logger = logging.get_logger(__name__) -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to file names for serializing Tokenizer instances -#################################################### VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to pretrained vocabulary URL for all the model ids. 
-####################################################
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
         "t5-small": "https://huggingface.co/t5-small/resolve/main/spiece.model",
@@ -59,9 +51,6 @@
     },
 }

-####################################################
-# Mapping from model ids to max length of inputs
-####################################################
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     "t5-small": 512,
     "t5-base": 512,
diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
index 28c18b093466e0..4a615742be22c8 100644
--- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -37,6 +37,17 @@
     "tokenizer_config_file": "tokenizer_config.json",
 }

+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json",
+    },
+    "tokenizer_config_file": {
+        "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer_config.json",
+    },
+}
+
+# Wav2Vec2 has no max input length
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/wav2vec2-base-960h": sys.maxsize}

 WAV2VEC2_KWARGS_DOCSTRING = r"""
             padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
@@ -98,16 +109,8 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
     """

     vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = {
-        "vocab_file": {
-            "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json"
-        },
-        "tokenizer_config_file": {
-            "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer.json",
-        },
-    }
-    # Wav2Vec2 has no max input length
-    max_model_input_sizes = {"facebook/wav2vec2-base-960h": sys.maxsize}
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     model_input_names = ["input_ids", "attention_mask"]

     def __init__(
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
index 31ba6bd964cb2b..f20ec4021c150c 100644
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
@@ -65,6 +65,8 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(BertTokenizerFast):
 logger = logging.get_logger(__name__)

+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
         "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.json",
@@ -93,6 +95,7 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(BartTokenizerFast):
     parameters.
""" + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer @@ -109,9 +112,16 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(BartTokenizerFast): logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} -PRETRAINED_VOCAB_FILES_MAP = {} +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt", + }, + "tokenizer_file": { + "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/tokenizer.json", + }, +} PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "{{cookiecutter.checkpoint_identifier}}": 1024, diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py index 71e19bbd461eca..7973c1e1dd4915 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py @@ -62,6 +62,8 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(BertTokenizer): logger = logging.get_logger(__name__) +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} + PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.json", @@ -90,6 +92,7 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(BartTokenizer): parameters. 
""" + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES @@ -105,9 +108,13 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(BartTokenizer): logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} -PRETRAINED_VOCAB_FILES_MAP = {} +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt", + }, +} PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "{{cookiecutter.checkpoint_identifier}}": 1024, diff --git a/tests/test_tokenization_marian.py b/tests/test_tokenization_marian.py index b5e02fb64bd97a..d78d582f3c02d3 100644 --- a/tests/test_tokenization_marian.py +++ b/tests/test_tokenization_marian.py @@ -26,7 +26,7 @@ if is_sentencepiece_available(): - from transformers.models.marian.tokenization_marian import save_json, vocab_files_names + from transformers.models.marian.tokenization_marian import VOCAB_FILES_NAMES, save_json from .test_tokenization_common import TokenizerTesterMixin @@ -50,11 +50,11 @@ def setUp(self): vocab = ["", "", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", ""] vocab_tokens = dict(zip(vocab, range(len(vocab)))) save_dir = Path(self.tmpdirname) - save_json(vocab_tokens, save_dir / vocab_files_names["vocab"]) - save_json(mock_tokenizer_config, save_dir / vocab_files_names["tokenizer_config_file"]) - if not (save_dir / vocab_files_names["source_spm"]).exists(): - copyfile(SAMPLE_SP, save_dir / vocab_files_names["source_spm"]) - copyfile(SAMPLE_SP, save_dir / vocab_files_names["target_spm"]) + save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab"]) + save_json(mock_tokenizer_config, save_dir / VOCAB_FILES_NAMES["tokenizer_config_file"]) + if not (save_dir / VOCAB_FILES_NAMES["source_spm"]).exists(): + copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["source_spm"]) + copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["target_spm"]) tokenizer = MarianTokenizer.from_pretrained(self.tmpdirname) tokenizer.save_pretrained(self.tmpdirname)