Merge pull request #9 from jklj077/patch-7

update tests
JustinLin610 · Jan 15, 2024 · ca4a8c5
2 parents 7b670a8 + 8b144e1
commit ca4a8c5
Showing 1 changed file with 3 additions and 3 deletions.
diff --git a/tests/models/qwen2/test_tokenization_qwen2.py b/tests/models/qwen2/test_tokenization_qwen2.py
@@ -142,7 +142,7 @@ def test_nfc_normalization(self):
         # per https://unicode.org/faq/normalization.html, there are three characters whose normalization forms
         # under NFC, NFD, NFKC, and NFKD are all different
         # using these, we can make sure only NFC is applied
-        input_string = "\u038e\u03ab\u1e61"  # the NFKC form
+        input_string = "\u03d2\u0301\u03d2\u0308\u017f\u0307"  # the NFD form
         output_string = "\u03d3\u03d4\u1e9b"  # the NFC form
 
         if self.test_slow_tokenizer:
@@ -166,8 +166,8 @@ def test_slow_tokenizer_decode_spaces_between_special_tokens_default(self):
         # in `spaces_between_special_tokens`, "special tokens" refers to the `legacy_added_tokens`,
         # which would be `"<|im_start|>"` and `"<|im_end|>"` in Qwen/Qwen2 models
         token_ids = [259, 260, 268, 269, 26]
-        sequence = " lower<|endoftext|><im_start>;"
-        sequence_with_space = " lower<|endoftext|> <im_start> ;"
+        sequence = " lower<|endoftext|><|im_start|>;"
+        sequence_with_space = " lower<|endoftext|> <|im_start|> ;"
 
         tokenizer = self.get_tokenizer()
         # let's add a legacy_added_tokens