From f7beb773e429573004f2f76f2330e18161f805a7 Mon Sep 17 00:00:00 2001
From: "renxuancheng.rxc"
Date: Sat, 13 Jan 2024 18:05:43 +0800
Subject: [PATCH 1/2] update files map

---
 src/transformers/models/qwen2/tokenization_qwen2.py      | 6 +++---
 src/transformers/models/qwen2/tokenization_qwen2_fast.py | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/qwen2/tokenization_qwen2.py b/src/transformers/models/qwen2/tokenization_qwen2.py
index 9d7650adac3c3c..98916222e51800 100644
--- a/src/transformers/models/qwen2/tokenization_qwen2.py
+++ b/src/transformers/models/qwen2/tokenization_qwen2.py
@@ -34,8 +34,8 @@
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {"qwen/qwen2": "https://huggingface.co/qwen/qwen2-7b-beta/resolve/main/vocab.json"},
-    "merges_file": {"qwen/qwen2": "https://huggingface.co/qwen/qwen2-7b-beta/resolve/main/merges.txt"},
+    "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"},
+    "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"},
 }
 
 PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
@@ -92,7 +92,7 @@ class Qwen2Tokenizer(PreTrainedTokenizer):
     ```python
     >>> from transformers import Qwen2Tokenizer
 
-    >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen2-7B-beta")
+    >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
     >>> tokenizer("Hello world")["input_ids"]
     [9707, 1879]
 
diff --git a/src/transformers/models/qwen2/tokenization_qwen2_fast.py b/src/transformers/models/qwen2/tokenization_qwen2_fast.py
index 09ae6ba05cf4d4..2ef20fea307bf7 100644
--- a/src/transformers/models/qwen2/tokenization_qwen2_fast.py
+++ b/src/transformers/models/qwen2/tokenization_qwen2_fast.py
@@ -31,9 +31,9 @@
 
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {"qwen/qwen2": "https://huggingface.co/qwen/qwen2/resolve/main/vocab.json"},
-    "merges_file": {"qwen/qwen2": "https://huggingface.co/qwen/qwen2/resolve/main/merges.txt"},
-    "tokenizer_file": {"qwen/qwen2": "https://huggingface.co/qwen/qwen2/resolve/main/tokenizer.json"},
+    "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"},
+    "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"},
+    "tokenizer_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/tokenizer.json"},
 }
 
 
@@ -48,7 +48,7 @@ class Qwen2TokenizerFast(PreTrainedTokenizerFast):
     ```python
     >>> from transformers import Qwen2TokenizerFast
 
-    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen2-7B-beta")
+    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
     >>> tokenizer("Hello world")["input_ids"]
     [9707, 1879]
 

From 35a7480f452b61b932c63725bf2db6d10a4caec8 Mon Sep 17 00:00:00 2001
From: "renxuancheng.rxc"
Date: Sat, 13 Jan 2024 18:21:19 +0800
Subject: [PATCH 2/2] update test

---
 tests/models/qwen2/test_tokenization_qwen2.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/models/qwen2/test_tokenization_qwen2.py b/tests/models/qwen2/test_tokenization_qwen2.py
index 5d90a76e401078..f6e1f70297920e 100644
--- a/tests/models/qwen2/test_tokenization_qwen2.py
+++ b/tests/models/qwen2/test_tokenization_qwen2.py
@@ -19,7 +19,7 @@
 import unittest
 
 from transformers import Qwen2Tokenizer, Qwen2TokenizerFast
-from transformers.models.qwen2.tokenization_qwen2 import VOCAB_FILES_NAMES
+from transformers.models.qwen2.tokenization_qwen2 import VOCAB_FILES_NAMES, bytes_to_unicode
 from transformers.testing_utils import require_tokenizers
 
 from ...test_tokenization_common import TokenizerTesterMixin
@@ -84,7 +84,9 @@ def setUp(self):
             ";} \u010a",
             "\u00cf \u0135",
         ]
-        self.special_tokens_map = {"eos_token": "<|endoftext|>", "pad_token": "<|endoftext|>"}
+
+        # unk_token is needed, because this stub tokenizer is not complete at the byte level
+        self.special_tokens_map = {"eos_token": "<|endoftext|>", "pad_token": "<|endoftext|>", "unk_token": "<|unk|>"}
 
         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
@@ -111,7 +113,7 @@ def get_input_output_texts(self, tokenizer):
         return input_text, output_text
 
     def test_python_full_tokenizer(self):
-        tokenizer = self.get_tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
+        tokenizer = self.get_tokenizer()
         sequence, _ = self.get_input_output_texts(tokenizer)
         bpe_tokens = [
             "l",
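
For reference, the docstring examples updated by the first patch correspond to usage along these lines. This is a minimal sketch based only on the doctests shown above; it assumes the `Qwen/Qwen-tokenizer` Hub repo referenced by the new URLs is reachable.

```python
# Sketch of the updated docstring example; assumes network access to the
# "Qwen/Qwen-tokenizer" repo on the Hugging Face Hub.
from transformers import Qwen2Tokenizer, Qwen2TokenizerFast

slow_tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
fast_tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")

# Per the doctest, both encode "Hello world" to [9707, 1879].
print(slow_tokenizer("Hello world")["input_ids"])
print(fast_tokenizer("Hello world")["input_ids"])
```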