File tree: 2 files changed, +18 −1 lines changed
Original file line number | Diff line number | Diff line change @@ -2057,7 +2057,7 @@ def from_pretrained(
2057 2057         if "tokenizer_file" in vocab_files and not re.search(vocab_files["tokenizer_file"], "".join(remote_files)):
2058 2058             # mistral tokenizer names are different, but we can still convert them if
2059 2059             # mistral common is not there
2060      -         other_pattern = re.escape("tekken.json|tokenizer.model.*")
     2060 +         other_pattern = r"tekken\.json|tokenizer\.model\.*"
2061 2061             if match := re.search(other_pattern, "\n".join(remote_files)):
2062 2062                 vocab_files["vocab_file"] = match.group()
20632063
Original file line number Diff line number Diff line change 1212# See the License for the specific language governing permissions and
1313# limitations under the License.
1414
15+ import importlib
1516import json
1617import os
1718import shutil
1819import sys
1920import tempfile
2021import unittest
2122from pathlib import Path
23+ from unittest import mock
2224
2325import pytest
2426
@@ -181,6 +183,21 @@ def test_from_pretrained_use_fast_toggle(self):
181183 )
182184 self .assertIsInstance (AutoTokenizer .from_pretrained ("google-bert/bert-base-cased" ), BertTokenizerFast )
183185
186+ @require_tokenizers
187+ def test_voxtral_tokenizer_converts_from_tekken (self ):
188+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
189+ tokenization_auto = transformers .models .auto .tokenization_auto
190+ with (
191+ mock .patch ("transformers.utils.import_utils.is_mistral_common_available" , return_value = False ),
192+ mock .patch ("transformers.models.auto.tokenization_auto.is_mistral_common_available" , return_value = False ),
193+ ):
194+ tokenization_auto = importlib .reload (tokenization_auto )
195+ tokenizer = tokenization_auto .AutoTokenizer .from_pretrained (repo_id ) # should not raise
196+
197+ self .assertIsInstance (tokenizer , PreTrainedTokenizerFast )
198+ self .assertTrue (tokenizer .is_fast )
199+ self .assertGreater (len (tokenizer ("Voxtral" )["input_ids" ]), 0 )
200+
184201 @require_tokenizers
185202 def test_do_lower_case (self ):
186203 tokenizer = AutoTokenizer .from_pretrained ("distilbert/distilbert-base-uncased" , do_lower_case = False )
You can’t perform that action at this time.
0 commit comments