update qwen2 tokenization and tests #8

Merged
merged 2 commits on Jan 15, 2024
3 changes: 2 additions & 1 deletion src/transformers/models/qwen2/tokenization_qwen2.py
@@ -43,8 +43,8 @@
PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""


# copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
@lru_cache()
# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
def bytes_to_unicode():
"""
Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
@@ -214,6 +214,7 @@ def __init__(
)

@property
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.vocab_size
def vocab_size(self) -> int:
return len(self.encoder)

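As a side note (not part of the diff): the `PRETOKENIZE_REGEX` above is what splits digits one at a time and attaches trailing newlines to runs of symbols. Here is a minimal sketch of its effect, assuming the third-party `regex` package (the standard `re` module does not support the `\p{L}`/`\p{N}` classes):

```python
import regex  # third-party package used by byte-level BPE tokenizers; `re` lacks \p{L}/\p{N}

PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""

# digits are split individually, and the symbol run ";}" absorbs the trailing newline
print(regex.findall(PRETOKENIZE_REGEX, "lower newer 010;}\n"))
# expected: ['lower', ' newer', ' ', '0', '1', '0', ';}\n']
```

Because each digit becomes its own pretoken, a merge such as `"0 1"` can never fire, which is what the comments in the test fixture below rely on.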
90 changes: 86 additions & 4 deletions tests/models/qwen2/test_tokenization_qwen2.py
@@ -18,9 +18,9 @@
import os
import unittest

from transformers import Qwen2Tokenizer, Qwen2TokenizerFast
from transformers import AddedToken, Qwen2Tokenizer, Qwen2TokenizerFast
from transformers.models.qwen2.tokenization_qwen2 import VOCAB_FILES_NAMES, bytes_to_unicode
from transformers.testing_utils import require_tokenizers
from transformers.testing_utils import require_tokenizers, slow

from ...test_tokenization_common import TokenizerTesterMixin

@@ -38,7 +38,12 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def setUp(self):
super().setUp()

# this makes sure the vocabulary is complete at the byte level.
vocab = list(bytes_to_unicode().values())
# the vocabulary, note:
# - `"\u0120n"`, `"\u0120lowest"`, `"\u0120newer"`, and `"\u0120wider"` are ineffective, because they are
#   not in the merges.
# - `"01"` is ineffective, because its merge is never applied due to pretokenization (digits are split).
vocab.extend(
[
"\u0120l",
@@ -58,6 +63,7 @@ def setUp(self):

vocab_tokens = dict(zip(vocab, range(len(vocab))))

# note: `"0 1"` is in the merges, but the pretokenization rules render it ineffective
merges = [
"#version: 0.2",
"\u0120 l",
@@ -70,7 +76,6 @@
"\u00cf \u0135",
]

# unk_token is needed, because this stub tokenizer is not complete at the byte level
self.special_tokens_map = {"eos_token": "<|endoftext|>"}

self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
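A quick sanity check (editor's sketch, not part of the PR) of the byte-level completeness mentioned in the comment at the top of `setUp`, using the `bytes_to_unicode` helper the test already imports:

```python
from transformers.models.qwen2.tokenization_qwen2 import bytes_to_unicode

# the byte-level map covers all 256 byte values, so any UTF-8 string can be
# represented by the stub vocabulary without an unk token
byte_map = bytes_to_unicode()
assert len(byte_map) == 256
assert byte_map[ord("A")] == "A"        # printable ASCII maps to itself
assert byte_map[ord(" ")] == "\u0120"   # space becomes "Ġ"
assert byte_map[ord("\n")] == "\u010a"  # newline becomes "Ċ"
```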
@@ -92,7 +97,6 @@ def get_input_output_texts(self, tokenizer):
# this case should cover
# - NFC normalization (code point U+03D3 has different normalization forms under NFC, NFD, NFKC, and NFKD)
# - the pretokenization rules (splitting digits and merging symbols with \n\r)
# - no space added by default to the left of the special tokens in decode
input_text = "lower lower newer 010;}\n<|endoftext|>\u03d2\u0301"
output_text = "lower lower newer 010;}\n<|endoftext|>\u03d3"
return input_text, output_text
@@ -133,3 +137,81 @@ def test_pretokenized_inputs(self):
# which eats the whitespaces, which, in turn, is not reversible.
# the results, by nature, should be different.
pass

def test_nfc_normalization(self):
# per https://unicode.org/faq/normalization.html, there are three characters whose normalization forms
# under NFC, NFD, NFKC, and NFKD are all different
# using these, we can make sure only NFC is applied
input_string = "\u038e\u03ab\u1e61" # the NFKC form
output_string = "\u03d3\u03d4\u1e9b" # the NFC form

if self.test_slow_tokenizer:
tokenizer = self.get_tokenizer()
tokenizer_output_string, _ = tokenizer.prepare_for_tokenization(input_string)
self.assertEqual(tokenizer_output_string, output_string)

if self.test_rust_tokenizer:
tokenizer = self.get_rust_tokenizer()
# we can check the class of the normalizer, but it would also be okay if, e.g., Sequence([NFD, NFC]) were used
# let's check the output instead
tokenizer_output_string = tokenizer.backend_tokenizer.normalizer.normalize_str(input_string)
self.assertEqual(tokenizer_output_string, output_string)
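For context (not part of the diff), the reason these particular characters are used: their NFC, NFD, NFKC, and NFKD forms are all distinct, so recovering the NFC form from the NFD input proves that exactly NFC (and not, say, NFKC) was applied. A small illustration with the standard library:

```python
import unicodedata

# U+03D3 is one of the three characters whose NFC, NFD, NFKC, and NFKD forms all differ
char = "\u03d3"
for form in ("NFC", "NFD", "NFKC", "NFKD"):
    print(form, [f"U+{ord(c):04X}" for c in unicodedata.normalize(form, char)])
# NFC  -> ['U+03D3']
# NFD  -> ['U+03D2', 'U+0301']
# NFKC -> ['U+038E']
# NFKD -> ['U+03A5', 'U+0301']
```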

def test_slow_tokenizer_decode_spaces_between_special_tokens_default(self):
# Qwen2Tokenizer changes the default `spaces_between_special_tokens` in `decode` to False
if not self.test_slow_tokenizer:
return

# tokenizer has a special token: `"<|endoftext|>"` as eos, but it is not one of the `legacy_added_tokens`
# the "special tokens" in `spaces_between_special_tokens` refer to the `legacy_added_tokens`
# that would be `"<|im_start|>"` and `"<|im_end|>"` in Qwen/Qwen2 models
token_ids = [259, 260, 268, 269, 26]
sequence = " lower<|endoftext|><im_start>;"
sequence_with_space = " lower<|endoftext|> <im_start> ;"

tokenizer = self.get_tokenizer()
# let's add one of the `legacy_added_tokens`
im_start = AddedToken(
"<|im_start|>", single_word=False, lstrip=False, rstrip=False, special=True, normalized=False
)
tokenizer.add_tokens([im_start])

# `spaces_between_special_tokens` defaults to False
self.assertEqual(tokenizer.decode(token_ids), sequence)

# but it can be set to True
self.assertEqual(tokenizer.decode(token_ids, spaces_between_special_tokens=True), sequence_with_space)
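For reference, a hedged usage sketch (not part of the diff) of the behavior this test pins down; the checkpoint name is borrowed from the integration test below, and the decoded strings in the comments are expected rather than verified outputs:

```python
from transformers import Qwen2Tokenizer

# sketch: the slow Qwen2 tokenizer defaults to spaces_between_special_tokens=False in decode,
# so added tokens such as "<|im_start|>" are not padded with extra spaces
tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
ids = tokenizer("<|im_start|>user\nhello<|im_end|>")["input_ids"]
print(tokenizer.decode(ids))                                      # e.g. "<|im_start|>user\nhello<|im_end|>"
print(tokenizer.decode(ids, spaces_between_special_tokens=True))  # e.g. "<|im_start|> user\nhello <|im_end|>"
```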

@slow
def test_tokenizer_integration(self):
sequences = [
"Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
"general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
"Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained "
"models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
"🤗 Transformers 提供了可以轻松地下载并且训练先进的预训练模型的 API 和工具。使用预训练模型可以减少计算消耗和碳排放,并且节省从头训练所需要的时间和资源。",
"""```python\ntokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-tokenizer")\n"""
"""tokenizer("世界,你好!")```""",
]

# fmt: off
expected_encoding = {
'input_ids': [
[8963, 388, 320, 69514, 3881, 438, 4510, 27414, 32852, 388, 323, 4510, 27414, 21334, 35722, 1455, 529, 8, 5707, 4586, 58238, 77235, 320, 61437, 11, 479, 2828, 12, 17, 11, 11830, 61437, 64, 11, 1599, 10994, 11, 27604, 321, 33, 529, 11, 29881, 6954, 32574, 369, 18448, 11434, 45451, 320, 45, 23236, 8, 323, 18448, 11434, 23470, 320, 30042, 38, 8, 448, 916, 220, 18, 17, 10, 80669, 4119, 304, 220, 16, 15, 15, 10, 15459, 323, 5538, 94130, 2897, 1948, 619, 706, 11, 5355, 51, 21584, 323, 94986, 13],
[144834, 80532, 93685, 83744, 34187, 73670, 104261, 29490, 62189, 103937, 104034, 102830, 98841, 104034, 104949, 9370, 5333, 58143, 102011, 1773, 37029, 98841, 104034, 104949, 73670, 101940, 100768, 104997, 33108, 100912, 105054, 90395, 100136, 106831, 45181, 64355, 104034, 113521, 101975, 33108, 85329, 1773, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643],
[73594, 12669, 198, 85593, 284, 8979, 37434, 6387, 10442, 35722, 445, 48, 16948, 45274, 16948, 34841, 3135, 1138, 85593, 445, 99489, 3837, 108386, 6313, 899, 73594, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643],
],
'attention_mask': [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
],
}
# fmt: on

self.tokenizer_integration_test_util(
expected_encoding=expected_encoding,
model_name="Qwen/Qwen-tokenizer",
revision="5909c8222473b2c73b0b73fb054552cd4ef6a8eb",
sequences=sequences,
)