Commit 2a59904

ArthurZucker authored and Cyrilvallez committed
fix tekken pattern matching (#42363)
* fix tekken pattern matching
* add a test
* up
* up
* style
1 parent 7e66db7 commit 2a59904

2 files changed (+18, -1 lines changed)

src/transformers/tokenization_utils_base.py

Lines changed: 1 addition & 1 deletion
@@ -2057,7 +2057,7 @@ def from_pretrained(
         if "tokenizer_file" in vocab_files and not re.search(vocab_files["tokenizer_file"], "".join(remote_files)):
             # mistral tokenizer names are different, but we can still convert them if
             # mistral common is not there
-            other_pattern = re.escape("tekken.json|tokenizer.model.*")
+            other_pattern = r"tekken\.json|tokenizer\.model\.*"
             if match := re.search(other_pattern, "\n".join(remote_files)):
                 vocab_files["vocab_file"] = match.group()
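Why the one-liner matters: re.escape() also escapes the "|", "." and "*" metacharacters, so the old pattern could only ever match the literal string "tekken.json|tokenizer.model.*", which never appears in a remote file listing. The raw pattern keeps "|" as alternation. A minimal standalone sketch (the file list below is made up for illustration):

import re

# Illustrative listing for a repo that ships a Mistral "tekken.json" tokenizer.
remote_files = ["config.json", "tekken.json"]
haystack = "\n".join(remote_files)

# Before the fix: every metacharacter is escaped, so only the literal
# "tekken.json|tokenizer.model.*" string would match -- it never does.
broken = re.escape("tekken.json|tokenizer.model.*")
print(re.search(broken, haystack))  # None

# After the fix: "|" is real alternation, so a plain "tekken.json"
# (or a "tokenizer.model..." file name) is found again.
fixed = r"tekken\.json|tokenizer\.model\.*"
print(re.search(fixed, haystack))   # <re.Match ... match='tekken.json'>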

tests/models/auto/test_tokenization_auto.py

Lines changed: 17 additions & 0 deletions
@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import importlib
 import json
 import os
 import shutil
 import sys
 import tempfile
 import unittest
 from pathlib import Path
+from unittest import mock
 
 import pytest
 
@@ -181,6 +183,21 @@ def test_from_pretrained_use_fast_toggle(self):
         )
         self.assertIsInstance(AutoTokenizer.from_pretrained("google-bert/bert-base-cased"), BertTokenizerFast)
 
+    @require_tokenizers
+    def test_voxtral_tokenizer_converts_from_tekken(self):
+        repo_id = "mistralai/Voxtral-Mini-3B-2507"
+        tokenization_auto = transformers.models.auto.tokenization_auto
+        with (
+            mock.patch("transformers.utils.import_utils.is_mistral_common_available", return_value=False),
+            mock.patch("transformers.models.auto.tokenization_auto.is_mistral_common_available", return_value=False),
+        ):
+            tokenization_auto = importlib.reload(tokenization_auto)
+            tokenizer = tokenization_auto.AutoTokenizer.from_pretrained(repo_id)  # should not raise
+
+        self.assertIsInstance(tokenizer, PreTrainedTokenizerFast)
+        self.assertTrue(tokenizer.is_fast)
+        self.assertGreater(len(tokenizer("Voxtral")["input_ids"]), 0)
+
     @require_tokenizers
     def test_do_lower_case(self):
         tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", do_lower_case=False)

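The test patches is_mistral_common_available and then reloads tokenization_auto, presumably because the availability check is evaluated when the module is imported, so patching the function alone would not change behavior. A self-contained sketch of that patch-then-reload pattern, using made-up module names (deps, featuremod) rather than the real transformers modules:

import importlib
import sys
import tempfile
from pathlib import Path
from unittest import mock

# Hypothetical two-module layout, written to a temp dir so the sketch runs on its own:
#   deps.py        exposes is_extra_available()
#   featuremod.py  snapshots that flag at import time (an import-level availability check)
tmp = Path(tempfile.mkdtemp())
(tmp / "deps.py").write_text("def is_extra_available():\n    return True\n")
(tmp / "featuremod.py").write_text(
    "from deps import is_extra_available\n\nHAS_EXTRA = is_extra_available()\n"
)
sys.path.insert(0, str(tmp))

import featuremod  # first import sees the real flag (True)

with mock.patch("deps.is_extra_available", return_value=False):
    # Reloading re-executes the module body while the patch is active,
    # so the import-time check now reports the dependency as missing.
    featuremod = importlib.reload(featuremod)
    assert featuremod.HAS_EXTRA is False

The real test patches the helper in two places for the usual from-import reason: the name lives both in transformers.utils.import_utils and, as a local binding, in transformers.models.auto.tokenization_auto.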