Add tokenizers class mismatch detection between cls and checkpoint #12619

Merged (12 commits), Jul 17, 2021
@@ -132,7 +132,7 @@ def __init__(
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
"model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
Contributor
It seems to me that the other tokenizers in the transformers library name the concrete tokenizer class here instead of the generic AutoTokenizer. Was there any particular reason to prefer AutoTokenizer to BertJapaneseTokenizer? 🙂

Suggested change
"model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
"model use `tokenizer = BertJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"

Contributor Author
I don't have a strong opinion about it either. I chose AutoTokenizer because I thought pointing users to AutoTokenizer would avoid problems like the one in this issue.

Collaborator
I think it's better to encourage users to use the AutoTokenizer class.
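For illustration, a minimal sketch of the difference, assuming the cl-tohoku/bert-base-japanese checkpoint that the new test below uses (and assuming fugashi/ipadic are installed so the Japanese tokenizer can load):

    from transformers import AutoTokenizer, BertTokenizer

    checkpoint = "cl-tohoku/bert-base-japanese"

    # AutoTokenizer reads the checkpoint's configuration and instantiates the matching
    # class (BertJapaneseTokenizer here), so users can't pick the wrong one by accident.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    print(type(tokenizer).__name__)  # BertJapaneseTokenizer

    # Calling a concrete class directly can load a tokenizer that does not match the
    # checkpoint; that silent mismatch is exactly what this PR detects and reports.
    wrong_tokenizer = BertTokenizer.from_pretrained(checkpoint)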

Contributor
Thanks a lot for your feedback @sgugger! In that case, @europeanplaice, your proposal is great - you can ignore my previous comment.

@sgugger, Should we take this opportunity to make the same change with other tokenizers that log the same type of message (cf PR #12745)?

Collaborator
Yes, that was a great idea!

)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
20 changes: 20 additions & 0 deletions src/transformers/tokenization_utils_base.py
@@ -111,6 +111,7 @@ class EncodingFast:
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
CONFIG_FILE = "config.json"
Collaborator
No, this file is the model configuration. It has nothing to do with the tokenizer and should not be put here.

Contributor Author
I agree. AutoConfig.from_pretrained makes this line unnecessary.


# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
FULL_TOKENIZER_FILE = "tokenizer.json"
@@ -1639,6 +1640,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
"tokenizer_file": FULL_TOKENIZER_FILE,
"config_file": CONFIG_FILE,
Collaborator
Same here.

Contributor Author
I agree too.

}
# Look for the tokenizer files
for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
@@ -1742,16 +1744,34 @@ def _from_pretrained(
# Prepare tokenizer initialization kwargs
# Did we saved some inputs and kwargs to reload ?
tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
config_tokenizer_class = None
if tokenizer_config_file is not None:
with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
init_kwargs = json.load(tokenizer_config_handle)
config_tokenizer_class = init_kwargs.get("tokenizer_class")
init_kwargs.pop("tokenizer_class", None)
saved_init_inputs = init_kwargs.pop("init_inputs", ())
if not init_inputs:
init_inputs = saved_init_inputs
else:
init_kwargs = init_configuration

if tokenizer_config_file is None or config_tokenizer_class is None:
config_file = resolved_vocab_files.pop("config_file", None)
if config_file is not None:
with open(config_file, encoding="utf-8") as config_handle:
config_dict = json.load(config_handle)
config_tokenizer_class = config_dict.get("tokenizer_class")
Collaborator
We should rely on AutoConfig.from_pretrained for this blob (inside a try block).

Contributor Author (@europeanplaice, Jul 14, 2021)
Thank you for your review. It is better than my code, and I avoided a circular import by importing AutoConfig inside _from_pretrained rather than at the top level (thanks to #12619 (comment)).
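In rough outline, the pattern looks like the sketch below (with a hypothetical helper name, and shown with the top-level AutoConfig import for brevity rather than the relative import used inside the library):

    def _tokenizer_class_from_config(pretrained_model_name_or_path):
        # Import inside the function rather than at module level, so that
        # tokenization_utils_base does not pull in the auto classes at import time,
        # which would otherwise create a circular import.
        from transformers import AutoConfig

        try:
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
            return getattr(config, "tokenizer_class", None)
        except OSError:
            # The checkpoint may not ship a config.json at all; in that case there is
            # nothing to compare against and the mismatch check is simply skipped.
            return None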


Contributor (@SaulLu, Jul 13, 2021)
-------- EDIT:--------
Reading @sgugger's answer, I also agree with him that we can simplify this part and use AutoConfig directly.

-------- Old comment:--------
The snippet below could therefore solve the limitation that you showed in the test you named test_limit_of_match_validation.

It would have to be checked by running all the tests, but I have the impression that by doing the imports at this level we don't have a circular import problem.

        # If we have not yet found the original type of the tokenizer we are loading, we see if we can infer it from
        # the type of the configuration file
        if config_dict is not None and config_tokenizer_class is None:
            from .models.auto.configuration_auto import CONFIG_MAPPING
            from .models.auto.tokenization_auto import TOKENIZER_MAPPING

            config_class = None
            if "model_type" in config_dict:
                config_class = CONFIG_MAPPING[config_dict["model_type"]]
            else:
                # Fallback: use pattern matching on the string, so config_class stays None if nothing matches.
                for pattern, config_class_tmp in CONFIG_MAPPING.items():
                    if pattern in str(pretrained_model_name_or_path):
                        config_class = config_class_tmp
                        break

            if config_class in TOKENIZER_MAPPING:
                config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING[config_class]
                if config_tokenizer_class is not None:
                    config_tokenizer_class = config_tokenizer_class.__name__
                else:
                    config_tokenizer_class = config_tokenizer_class_fast.__name__

Contributor Author
Thank you for an excellent suggestion!

if config_tokenizer_class is not None:
if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""):
raise ValueError(
"The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. "
"It may result in unexpected tokenization. \n"
f"The tokenizer class you load from this checkpoint is '{config_tokenizer_class}'. \n"
f"The class this function is called from is '{cls.__name__}'."
Comment on lines +1794 to +1797
Contributor
Great 👍! This will really help future users.

)

# Update with newly provided kwargs
init_kwargs.update(kwargs)

17 changes: 17 additions & 0 deletions tests/test_tokenization_base.py
@@ -0,0 +1,17 @@
import unittest

from transformers.models.bert.tokenization_bert import BertTokenizer
Collaborator
This test should go in an existing test file, for instance the one already testing BertJapaneseTokenizer or the common tokenizer test file.

Contributor Author
I'll remove test_tokenization_base.py and introduce the test from #12619 (comment) instead.

from transformers.models.bert_japanese.tokenization_bert_japanese import BertJapaneseTokenizer


class ClassMismatchTest(unittest.TestCase):
def test_mismatch_error(self):
PRETRAINED_MODEL = "cl-tohoku/bert-base-japanese"
with self.assertRaises(ValueError):
BertTokenizer.from_pretrained(PRETRAINED_MODEL)

def test_limit_of_match_validation(self):
# Can't detect mismatch because this model's config
# doesn't have information about the tokenizer model.
PRETRAINED_MODEL = "bert-base-uncased"
BertJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL)
Contributor
Thank you very much for writing this test: we immediately understand the new feature!

As the added changes concern all tokenizers, not only BertTokenizer and BertJapaneseTokenizer, I think it would be interesting to test the logged warning for all tokenizers by adding a new test to TokenizerTesterMixin in the test_tokenization_common.py file. This new test could for example look something like this:

    def test_tokenizer_mismatch_warning(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                if self.tokenizer_class != BertTokenizer:
                    with self.assertLogs("transformers", level="WARNING") as cm:
                        try:
                            BertTokenizer.from_pretrained(pretrained_name)
                        except (TypeError, AttributeError):
                            # Some tokenizers cannot be loaded into `BertTokenizer` at all and errors are returned,
                            # here we just check that the warning has been logged before the error is raised
                            pass
                        finally:
                            self.assertTrue(
                                cm.records[0].message.startswith(
                                    "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from."
                                )
                            )
                if self.rust_tokenizer_class != BertTokenizerFast:
                    with self.assertLogs("transformers", level="WARNING") as cm:
                        try:
                            BertTokenizerFast.from_pretrained(pretrained_name)
                        except (TypeError, AttributeError):
                            # Some tokenizers cannot be loaded into `BertTokenizerFast` at all and errors are returned,
                            # here we just check that the warning has been logged before the error is raised
                            pass
                        finally:
                            self.assertTrue(
                                cm.records[0].message.startswith(
                                    "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from."
                                )
                            )

What do you think?

PS: I can of course help make this change if needed, especially as an adaptation will have to be made for PreTrainedTokenizerFast.

Contributor Author
It's an excellent idea, and I'd like this test to cover all tokenizers, including BertTokenizer and BertJapaneseTokenizer. I changed your suggestion to this. Is it missing anything needed for the test?

    def test_tokenizer_mismatch_warning(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                with self.assertLogs("transformers", level="WARNING") as cm:
                    try:
                        if self.tokenizer_class == BertTokenizer:
                            AlbertTokenizer.from_pretrained(pretrained_name)
                        else:
                            BertTokenizer.from_pretrained(pretrained_name)
                    except (TypeError, AttributeError):
                        # Some tokenizers cannot be loaded into the target tokenizer at all and errors are returned,
                        # here we just check that the warning has been logged before the error is raised
                        pass
                    finally:
                        self.assertTrue(
                            cm.records[0].message.startswith(
                                "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from."
                            )
                        )
                    try:
                        if self.rust_tokenizer_class == BertTokenizerFast:
                            AlbertTokenizerFast.from_pretrained(pretrained_name)
                        else:
                            BertTokenizerFast.from_pretrained(pretrained_name)
                    except (TypeError, AttributeError):
                        # Some tokenizers cannot be loaded into the target tokenizer at all and errors are returned,
                        # here we just check that the warning has been logged before the error is raised
                        pass
                    finally:
                        self.assertTrue(
                            cm.records[0].message.startswith(
                                "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from."
                            )
                        )

Contributor
Looks great to me! 🙂