Add unigram bytefallback #1217
@@ -5,7 +5,7 @@
 from tokenizers import AddedToken, Encoding, Tokenizer
 from tokenizers.implementations import BertWordPieceTokenizer
-from tokenizers.models import BPE, Model, WordPiece
+from tokenizers.models import BPE, Model, WordPiece, Unigram
 from tokenizers.normalizers import Lowercase
 from tokenizers.pre_tokenizers import ByteLevel
 from tokenizers.processors import BertProcessing, RobertaProcessing
@@ -412,3 +412,29 @@ def test_from_pretrained_revision(self):
         tokenizer = Tokenizer.from_pretrained("anthony/tokenizers-test", revision="gpt-2")
         output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
         assert output.tokens == ["Hey", "Ġthere", "Ġdear", "Ġfriend", "!"]
+
+    def test_unigram_byte_fallback(self):
+        vocab = [
+            ("<unk>", 0.0),
+            ("A", -0.01),
+            ("sen", -0.02),
+            ("te", -0.03),
+            ("n", -0.04),
+            ("ce", -0.05),
+            ("<0xF0>", -0.06),
+            ("<0x9F>", -0.06),
+            ("<0xA4>", -0.06),
+            ("<0x97>", -0.06),
+            (" ", -0.4),
+        ]
+        tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=False))
+
+        output = tokenizer.encode("A sentence 🤗")
+        assert output.ids == [1, 10, 2, 3, 4, 5, 10, 0]
+        assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "🤗"]
+
+        tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=True))
+
+        output = tokenizer.encode("A sentence 🤗")
+        assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
+        assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]
Review discussion on this test:

Reviewer: Let's remove the tokens, please, only the ids matter.
Reviewer: Same here, tokens are unimportant, only ids are.
Author: I'm keeping them to understand what's going on in terms of fallback 😉
Reviewer: Good point.
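For reference, a sketch of the ids-only check the review initially asked for, reusing the vocab from the test above; it assumes a tokenizers build that includes this PR's byte_fallback parameter:

from tokenizers import Tokenizer
from tokenizers.models import Unigram

# Same vocab as the test above; ids are positions in this list.
vocab = [("<unk>", 0.0), ("A", -0.01), ("sen", -0.02), ("te", -0.03),
         ("n", -0.04), ("ce", -0.05), ("<0xF0>", -0.06), ("<0x9F>", -0.06),
         ("<0xA4>", -0.06), ("<0x97>", -0.06), (" ", -0.4)]
tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=True))

# Ids-only assertion: 🤗 falls back to the four byte tokens (ids 6..9).
assert tokenizer.encode("A sentence 🤗").ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]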