Update tokenization_qwen2.py #3

Merged
merged 1 commit on Jan 11, 2024
9 changes: 4 additions & 5 deletions src/transformers/models/qwen2/tokenization_qwen2.py
@@ -139,9 +139,6 @@ class Qwen2Tokenizer(PreTrainedTokenizer):
             to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") =
             ['<|endoftext|>`]. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will be give `['<',
             '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
-        chat_template (`str`, *optional*):
-            A Jinja template string that will be used to format lists of chat messages. See
-            https://huggingface.co/docs/transformers/chat_templating for a full description.
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -159,7 +156,6 @@ def __init__(
         pad_token="<|endoftext|>",
         clean_up_tokenization_spaces=False,
         split_special_tokens=False,
-        chat_template=CHAT_TEMPLATE,
         **kwargs,
     ):
         # Qwen vocab does not contain control tokens; added tokens need to be special
@@ -204,12 +200,15 @@ def __init__(

         self.pat = re.compile(PRETOKENIZE_REGEX)

+        if "chat_template" not in kwargs:
+            # if not specified, Qwen2 models should default to the CHATML template
+            kwargs["chat_template"] = CHAT_TEMPLATE
+
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
             pad_token=pad_token,
             unk_token=unk_token,
-            chat_template=chat_template,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             split_special_tokens=split_special_tokens,
             **kwargs,
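
The hunk above moves the ChatML default out of the `__init__` signature and into `kwargs`, so an explicit `chat_template=` argument (or one loaded from `tokenizer_config.json`) still takes precedence. Below is a minimal, self-contained sketch of that defaulting pattern; the `_Base`/`_QwenLike` helper classes and the placeholder template string are illustrative assumptions, not part of the diff.

# Sketch of the kwargs-level defaulting used in the diff; the names below are
# stand-ins, not the real transformers classes.
CHAT_TEMPLATE = "<placeholder ChatML Jinja template>"


class _Base:
    def __init__(self, **kwargs):
        # Stand-in for PreTrainedTokenizer, which accepts `chat_template` through **kwargs.
        self.chat_template = kwargs.get("chat_template")


class _QwenLike(_Base):
    def __init__(self, **kwargs):
        if "chat_template" not in kwargs:
            # if not specified, default to the ChatML template, as the diff does
            kwargs["chat_template"] = CHAT_TEMPLATE
        super().__init__(**kwargs)


print(_QwenLike().chat_template == CHAT_TEMPLATE)       # True: default applied
print(_QwenLike(chat_template="custom").chat_template)  # custom: caller override wins

Defaulting through `kwargs` rather than a named parameter keeps the constructor signature unchanged while still letting any caller-supplied template win.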
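
For the `split_special_tokens` behaviour documented in the first hunk's context lines, a hedged usage sketch: the checkpoint id is an assumption, and the printed outputs are the ones the docstring describes.

from transformers import Qwen2Tokenizer

# Assumed repo id; any repository shipping the slow tokenizer's vocab.json/merges.txt works.
tok = Qwen2Tokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
print(tok.tokenize("<|endoftext|>"))        # ['<|endoftext|>']

tok_split = Qwen2Tokenizer.from_pretrained("Qwen/Qwen1.5-0.5B", split_special_tokens=True)
print(tok_split.tokenize("<|endoftext|>"))  # ['<', '|', 'endo', 'ft', 'ext', '|', '>']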