
Commit

Merge pull request #5 from jklj077/patch-4
Update files map so that tests can find the pretrained tokenizers
JustinLin610 authored Jan 13, 2024
2 parents 985fe0f + 35a7480 commit 26a24db
Showing 3 changed files with 12 additions and 10 deletions.
6 changes: 3 additions & 3 deletions src/transformers/models/qwen2/tokenization_qwen2.py
@@ -34,8 +34,8 @@
 }

 PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {"qwen/qwen2": "https://huggingface.co/qwen/qwen2-7b-beta/resolve/main/vocab.json"},
-    "merges_file": {"qwen/qwen2": "https://huggingface.co/qwen/qwen2-7b-beta/resolve/main/merges.txt"},
+    "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"},
+    "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"},
 }

 PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
@@ -92,7 +92,7 @@ class Qwen2Tokenizer(PreTrainedTokenizer):
     ```python
     >>> from transformers import Qwen2Tokenizer
-    >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen2-7B-beta")
+    >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
     >>> tokenizer("Hello world")["input_ids"]
     [9707, 1879]
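As a side note (not part of the diff), here is a minimal sketch of what the updated map entries resolve to: the slow tokenizer can be built directly from the vocab.json and merges.txt hosted in the standalone Qwen/Qwen-tokenizer repo that the new URLs point at. The repo id and the expected token ids are taken from the diff above; hub availability and network access are assumed.

```python
# Sketch only: rebuild the slow tokenizer from the files the updated map points to.
# Assumes the "Qwen/Qwen-tokenizer" hub repo referenced in the diff is reachable.
from huggingface_hub import hf_hub_download
from transformers import Qwen2Tokenizer

vocab_file = hf_hub_download("Qwen/Qwen-tokenizer", "vocab.json")
merges_file = hf_hub_download("Qwen/Qwen-tokenizer", "merges.txt")

tokenizer = Qwen2Tokenizer(vocab_file=vocab_file, merges_file=merges_file)
print(tokenizer("Hello world")["input_ids"])  # expected [9707, 1879], matching the docstring above
```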
8 changes: 4 additions & 4 deletions src/transformers/models/qwen2/tokenization_qwen2_fast.py
@@ -31,9 +31,9 @@


 PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {"qwen/qwen2": "https://huggingface.co/qwen/qwen2/resolve/main/vocab.json"},
-    "merges_file": {"qwen/qwen2": "https://huggingface.co/qwen/qwen2/resolve/main/merges.txt"},
-    "tokenizer_file": {"qwen/qwen2": "https://huggingface.co/qwen/qwen2/resolve/main/tokenizer.json"},
+    "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"},
+    "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"},
+    "tokenizer_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/tokenizer.json"},
 }


@@ -48,7 +48,7 @@ class Qwen2TokenizerFast(PreTrainedTokenizerFast):
     ```python
     >>> from transformers import Qwen2TokenizerFast
-    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen2-7B-beta")
+    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
     >>> tokenizer("Hello world")["input_ids"]
     [9707, 1879]
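For the fast tokenizer, the map additionally lists tokenizer.json. A hedged sketch, assuming the same hub repo as above: the fast class can be instantiated straight from that single serialized file, without passing vocab.json or merges.txt.

```python
# Sketch only: build the fast tokenizer from the tokenizer.json the updated map lists.
# "Qwen/Qwen-tokenizer" is the repo id from the diff; hub availability is assumed.
from huggingface_hub import hf_hub_download
from transformers import Qwen2TokenizerFast

tokenizer_file = hf_hub_download("Qwen/Qwen-tokenizer", "tokenizer.json")
fast_tokenizer = Qwen2TokenizerFast(tokenizer_file=tokenizer_file)
print(fast_tokenizer("Hello world")["input_ids"])  # expected to agree with the slow tokenizer: [9707, 1879]
```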
8 changes: 5 additions & 3 deletions tests/models/qwen2/test_tokenization_qwen2.py
@@ -19,7 +19,7 @@
 import unittest

 from transformers import Qwen2Tokenizer, Qwen2TokenizerFast
-from transformers.models.qwen2.tokenization_qwen2 import VOCAB_FILES_NAMES
+from transformers.models.qwen2.tokenization_qwen2 import VOCAB_FILES_NAMES, bytes_to_unicode
 from transformers.testing_utils import require_tokenizers

 from ...test_tokenization_common import TokenizerTesterMixin
@@ -84,7 +84,9 @@ def setUp(self):
             ";} \u010a",
             "\u00cf \u0135",
         ]
-        self.special_tokens_map = {"eos_token": "<|endoftext|>", "pad_token": "<|endoftext|>"}
+
+        # unk_token is needed, because this stub tokenizer is not complete at the byte level
+        self.special_tokens_map = {"eos_token": "<|endoftext|>", "pad_token": "<|endoftext|>", "unk_token": "<|unk|>"}

         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
@@ -111,7 +113,7 @@ def get_input_output_texts(self, tokenizer):
         return input_text, output_text

     def test_python_full_tokenizer(self):
-        tokenizer = self.get_tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
+        tokenizer = self.get_tokenizer()
         sequence, _ = self.get_input_output_texts(tokenizer)
         bpe_tokens = [
             "l",
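A brief illustration of the new comment about the unk_token (a sketch under assumptions, not part of the PR): bytes_to_unicode(), now imported by the test module, enumerates the full byte-level alphabet, while the tiny hand-written vocab in setUp covers only a few of those symbols, so anything outside that set has to fall back to "<|unk|>".

```python
# Sketch only: why an incomplete byte-level stub vocab needs an unk_token.
from transformers.models.qwen2.tokenization_qwen2 import bytes_to_unicode

byte_encoder = bytes_to_unicode()            # maps every byte value to a printable unicode symbol
print(len(byte_encoder))                     # 256 symbols a complete byte-level vocab would cover
stub_symbols = {"l", "o", "w", "e", "r"}     # hypothetical stand-in for the test's tiny vocab
uncovered = [sym for sym in byte_encoder.values() if sym not in stub_symbols]
print(len(uncovered))                        # every one of these would map to "<|unk|>" in the stub tokenizer
```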
