Commit 2a59904

ArthurZucker authored and Cyrilvallez committed
fix tekken pattern matching (#42363)
* fix tekken pattern matching
* add a test
* up
* up
* style
1 parent 7e66db7 commit 2a59904

2 files changed (+18, -1 lines changed)

src/transformers/tokenization_utils_base.py

Lines changed: 1 addition & 1 deletion
@@ -2057,7 +2057,7 @@ def from_pretrained(
         if "tokenizer_file" in vocab_files and not re.search(vocab_files["tokenizer_file"], "".join(remote_files)):
             # mistral tokenizer names are different, but we can still convert them if
             # mistral common is not there
-            other_pattern = re.escape("tekken.json|tokenizer.model.*")
+            other_pattern = r"tekken\.json|tokenizer\.model\.*"
             if match := re.search(other_pattern, "\n".join(remote_files)):
                 vocab_files["vocab_file"] = match.group()
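Why the one-liner matters: re.escape() also escapes the "|", "." and "*" metacharacters, so the old pattern could only ever match the literal string "tekken.json|tokenizer.model.*", which never appears in a remote file listing. The raw pattern keeps "|" as alternation. A minimal standalone sketch (the file list below is made up for illustration):

import re

# Illustrative listing for a repo that ships a Mistral "tekken.json" tokenizer.
remote_files = ["config.json", "tekken.json"]
haystack = "\n".join(remote_files)

# Before the fix: every metacharacter is escaped, so only the literal
# "tekken.json|tokenizer.model.*" string would match -- it never does.
broken = re.escape("tekken.json|tokenizer.model.*")
print(re.search(broken, haystack))  # None

# After the fix: "|" is real alternation, so a plain "tekken.json"
# (or a "tokenizer.model..." file name) is found again.
fixed = r"tekken\.json|tokenizer\.model\.*"
print(re.search(fixed, haystack))   # <re.Match ... match='tekken.json'>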

tests/models/auto/test_tokenization_auto.py

Lines changed: 17 additions & 0 deletions
@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import importlib
 import json
 import os
 import shutil
 import sys
 import tempfile
 import unittest
 from pathlib import Path
+from unittest import mock
 
 import pytest
 
@@ -181,6 +183,21 @@ def test_from_pretrained_use_fast_toggle(self):
         )
         self.assertIsInstance(AutoTokenizer.from_pretrained("google-bert/bert-base-cased"), BertTokenizerFast)
 
+    @require_tokenizers
+    def test_voxtral_tokenizer_converts_from_tekken(self):
+        repo_id = "mistralai/Voxtral-Mini-3B-2507"
+        tokenization_auto = transformers.models.auto.tokenization_auto
+        with (
+            mock.patch("transformers.utils.import_utils.is_mistral_common_available", return_value=False),
+            mock.patch("transformers.models.auto.tokenization_auto.is_mistral_common_available", return_value=False),
+        ):
+            tokenization_auto = importlib.reload(tokenization_auto)
+            tokenizer = tokenization_auto.AutoTokenizer.from_pretrained(repo_id)  # should not raise
+
+        self.assertIsInstance(tokenizer, PreTrainedTokenizerFast)
+        self.assertTrue(tokenizer.is_fast)
+        self.assertGreater(len(tokenizer("Voxtral")["input_ids"]), 0)
+
     @require_tokenizers
     def test_do_lower_case(self):
         tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", do_lower_case=False)

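The test patches is_mistral_common_available and then reloads tokenization_auto, presumably because the availability check is evaluated when the module is imported, so patching the function alone would not change behavior. A self-contained sketch of that patch-then-reload pattern, using made-up module names (deps, featuremod) rather than the real transformers modules:

import importlib
import sys
import tempfile
from pathlib import Path
from unittest import mock

# Hypothetical two-module layout, written to a temp dir so the sketch runs on its own:
#   deps.py        exposes is_extra_available()
#   featuremod.py  snapshots that flag at import time (an import-level availability check)
tmp = Path(tempfile.mkdtemp())
(tmp / "deps.py").write_text("def is_extra_available():\n    return True\n")
(tmp / "featuremod.py").write_text(
    "from deps import is_extra_available\n\nHAS_EXTRA = is_extra_available()\n"
)
sys.path.insert(0, str(tmp))

import featuremod  # first import sees the real flag (True)

with mock.patch("deps.is_extra_available", return_value=False):
    # Reloading re-executes the module body while the patch is active,
    # so the import-time check now reports the dependency as missing.
    featuremod = importlib.reload(featuremod)
    assert featuremod.HAS_EXTRA is False

The real test patches the helper in two places for the usual from-import reason: the name lives both in transformers.utils.import_utils and, as a local binding, in transformers.models.auto.tokenization_auto.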