diff --git a/tests/byt5/test_tokenization_byt5.py b/tests/byt5/test_tokenization_byt5.py
index afdcae0ee38955..eb210530f0f36c 100644
--- a/tests/byt5/test_tokenization_byt5.py
+++ b/tests/byt5/test_tokenization_byt5.py
@@ -321,3 +321,14 @@ def test_pretokenized_inputs(self):
     # tests all ids in vocab => vocab doesn't exist so unnecessary to test
     def test_conversion_reversible(self):
         pass
+
+    def test_convert_tokens_to_string_format(self):
+        # The default common tokenizer tests use invalid tokens for ByT5, which can only accept one-character
+        # strings and special added tokens as tokens
+        tokenizers = self.get_tokenizers(fast=True, do_lower_case=True)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                tokens = ["t", "h", "i", "s", " ", "i", "s", " ", "a", " ", "t", "e", "x", "t", "</s>"]
+                string = tokenizer.convert_tokens_to_string(tokens)
+
+                self.assertIsInstance(string, str)
diff --git a/tests/perceiver/test_tokenization_perceiver.py b/tests/perceiver/test_tokenization_perceiver.py
index 214e6aff38e9ff..0b6b7d4c75a8b2 100644
--- a/tests/perceiver/test_tokenization_perceiver.py
+++ b/tests/perceiver/test_tokenization_perceiver.py
@@ -286,3 +286,14 @@ def test_pretokenized_inputs(self):
     # tests all ids in vocab => vocab doesn't exist so unnecessary to test
     def test_conversion_reversible(self):
         pass
+
+    def test_convert_tokens_to_string_format(self):
+        # The default common tokenizer tests use invalid tokens for Perceiver, which can only accept
+        # one-character strings and special added tokens as tokens
+        tokenizers = self.get_tokenizers(fast=True, do_lower_case=True)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                tokens = ["[CLS]", "t", "h", "i", "s", " ", "i", "s", " ", "a", " ", "t", "e", "s", "t", "[SEP]"]
+                string = tokenizer.convert_tokens_to_string(tokens)
+
+                self.assertIsInstance(string, str)
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index f260fa71fff190..2d26d76b9a089b 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -3713,6 +3713,15 @@ def test_saving_tokenizer_trainer(self):
                     trainer.save_model(os.path.join(tmp_dir, "checkpoint"))
                     self.assertIn("tokenizer.json", os.listdir(os.path.join(tmp_dir, "checkpoint")))

+    def test_convert_tokens_to_string_format(self):
+        tokenizers = self.get_tokenizers(fast=True, do_lower_case=True)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                tokens = ["this", "is", "a", "test"]
+                string = tokenizer.convert_tokens_to_string(tokens)
+
+                self.assertIsInstance(string, str)
+
     def test_save_slow_from_fast_and_reload_fast(self):
         if not self.test_slow_tokenizer or not self.test_rust_tokenizer:
             # we need both slow and fast versions
diff --git a/tests/wav2vec2/test_tokenization_wav2vec2.py b/tests/wav2vec2/test_tokenization_wav2vec2.py
index 98c6f126bbfbb2..775b3916e7a650 100644
--- a/tests/wav2vec2/test_tokenization_wav2vec2.py
+++ b/tests/wav2vec2/test_tokenization_wav2vec2.py
@@ -753,3 +753,14 @@ def test_tf_encode_plus_sent_to_model(self):
     @unittest.skip("The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode.")
     def test_torch_encode_plus_sent_to_model(self):
         pass
+
+    def test_convert_tokens_to_string_format(self):
+        # The default common tokenizer tests assume that the output of `convert_tokens_to_string` is a string,
+        # which is not the case for Wav2Vec2.
+        tokenizers = self.get_tokenizers(fast=True, do_lower_case=True)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                tokens = ["T", "H", "I", "S", "|", "I", "S", "|", "A", "|", "T", "E", "X", "T"]
+                output = tokenizer.convert_tokens_to_string(tokens)
+
+                self.assertIsInstance(output["text"], str)
diff --git a/tests/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py b/tests/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
index 73f47010b777a2..24582cefbbd9d9 100644
--- a/tests/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
+++ b/tests/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
@@ -398,3 +398,14 @@ def test_tf_encode_plus_sent_to_model(self):
     @unittest.skip("The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode.")
     def test_torch_encode_plus_sent_to_model(self):
         pass
+
+    def test_convert_tokens_to_string_format(self):
+        # The default common tokenizer tests assume that the output of `convert_tokens_to_string` is a string,
+        # which is not the case for Wav2Vec2PhonemeCTCTokenizer.
+        tokenizers = self.get_tokenizers(fast=True, do_lower_case=True)
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                tokens = ["ð", "ɪ", "s", "ɪ", "z", "ɐ", "t", "ɛ", "k", "s", "t"]
+                output = tokenizer.convert_tokens_to_string(tokens)
+
+                self.assertIsInstance(output["text"], str)