From 6ed07ac3bcf1cb8fa5a8baf7c56238896d39ad35 Mon Sep 17 00:00:00 2001
From: Ita Zaporozhets
Date: Tue, 14 May 2024 10:18:23 +0200
Subject: [PATCH] added assertions

---
 tests/test_tokenization_common.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 0f04bd27d17d..9aa7698ff871 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -4196,13 +4196,22 @@ def test_split_special_tokens(self):
                 self.assertEqual(cr_output, r_output)
                 self.assertTrue(special_token not in p_output)
 
-                p_output_explicit = tokenizer_p.tokenize(f"Hey this is a {special_token} token",
-                                                         split_special_tokens=False)
+                p_output_explicit = tokenizer_p.tokenize(f"Hey this is a {special_token} token", split_special_tokens=False)
+                r_output_explicit = tokenizer_r.tokenize(f"Hey this is a {special_token} token", split_special_tokens=False)
+                cr_output_explicit = tokenizer_cr.tokenize(f"Hey this is a {special_token} token", split_special_tokens=False)
+                self.assertTrue(special_token in p_output_explicit)
+                self.assertEqual(p_output_explicit, r_output_explicit)
+                self.assertEqual(cr_output_explicit, r_output_explicit)
 
-                special_token_id = tokenizer_r.encode(special_token, add_special_tokens=False)[0]
+                p_special_token_id = tokenizer_p.encode(special_token, add_special_tokens=False)[0]
 
                 p_output = tokenizer_p(f"Hey this is a {special_token} token")
-                self.assertTrue(special_token_id not in p_output)
+                r_output = tokenizer_r(f"Hey this is a {special_token} token")
+                cr_output = tokenizer_cr(f"Hey this is a {special_token} token")
+
+                self.assertTrue(p_special_token_id not in p_output)
+                self.assertEqual(p_output, r_output)
+                self.assertEqual(cr_output, r_output)
 
                 tmpdirname = tempfile.mkdtemp()
                 tokenizer_p.save_pretrained(tmpdirname)
@@ -4211,8 +4220,7 @@ def test_split_special_tokens(self):
                 output_reloaded = fast_from_saved.tokenize(f"Hey this is a {special_token} token")
                 self.assertTrue(special_token not in output_reloaded)
 
-                output_explicit_reloaded = fast_from_saved.tokenize(f"Hey this is a {special_token} token",
-                                                                    split_special_tokens=False)
+                output_explicit_reloaded = fast_from_saved.tokenize(f"Hey this is a {special_token} token", split_special_tokens=False)
                 self.assertTrue(special_token in output_explicit_reloaded)
 
     def test_added_tokens_serialization(self):
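
Note (not part of the patch): a minimal usage sketch of the behaviour the new assertions cover, assuming a transformers version that carries this change and using the hypothetical checkpoint "bert-base-uncased"; the exact sub-tokens produced depend on the tokenizer's vocabulary.

    from transformers import AutoTokenizer

    text = "Hey this is a [SEP] token"

    # Tokenizer initialized with split_special_tokens=True: "[SEP]" is treated as
    # ordinary text and broken into sub-tokens (what p_output / r_output / cr_output check).
    tok = AutoTokenizer.from_pretrained("bert-base-uncased", split_special_tokens=True)
    print(tok.tokenize(text))
    # e.g. ['hey', 'this', 'is', 'a', '[', 'sep', ']', 'token']

    # Passing split_special_tokens=False on the call keeps the special token intact
    # (what the *_output_explicit assertions added above check).
    print(tok.tokenize(text, split_special_tokens=False))
    # e.g. ['hey', 'this', 'is', 'a', '[SEP]', 'token']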