Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Core Tokenization] Support a fix for spm fast models #26678

Merged
merged 62 commits into from
Jan 18, 2024
Merged
Show file tree
Hide file tree
Changes from 61 commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
303a82c
fix
ArthurZucker Oct 3, 2023
cbf179a
Merge branch 'main' of github.com:huggingface/transformers into fix-main
ArthurZucker Oct 3, 2023
01e18db
last attempt
ArthurZucker Oct 3, 2023
08a560a
current work
ArthurZucker Oct 4, 2023
23c9513
fix forward compatibility
ArthurZucker Oct 4, 2023
0ae13ed
save all special tokens
ArthurZucker Oct 5, 2023
d887f68
Merge branch 'fix-main' of github.com:ArthurZucker/transformers into …
ArthurZucker Oct 5, 2023
72ff80e
current state
ArthurZucker Oct 5, 2023
b7b7d13
revert additional changes
ArthurZucker Oct 5, 2023
36d5303
updates
ArthurZucker Oct 5, 2023
ae93856
remove tokenizer.model
ArthurZucker Oct 5, 2023
88ea352
add a test and the fix
ArthurZucker Oct 5, 2023
ca98fbd
nit
ArthurZucker Oct 5, 2023
3c22fbb
revert one more break
ArthurZucker Oct 5, 2023
dc93d5e
fix typefield issue
ArthurZucker Oct 5, 2023
00997e9
quality
ArthurZucker Oct 5, 2023
6143634
more tests
ArthurZucker Oct 5, 2023
907591f
fix fields for FC
ArthurZucker Oct 5, 2023
5df5a83
Merge branch 'fix-main' of github.com:ArthurZucker/transformers into …
ArthurZucker Oct 5, 2023
66ecb9e
Merge branch 'fix-main' of github.com:ArthurZucker/transformers into …
ArthurZucker Oct 5, 2023
0e7bd61
more nits?
ArthurZucker Oct 5, 2023
381a0ec
Merge branch 'fix-main' of github.com:ArthurZucker/transformers into …
ArthurZucker Oct 6, 2023
bf75334
new additional changes
ArthurZucker Oct 6, 2023
fafbbed
how
ArthurZucker Oct 6, 2023
c6de7b2
some updates
ArthurZucker Oct 6, 2023
104b03a
the fix
ArthurZucker Oct 7, 2023
c2b03c1
where do we stand
ArthurZucker Oct 7, 2023
a088e20
Merge branch 'main' of github.com:huggingface/transformers into fix-s…
ArthurZucker Oct 26, 2023
12d8d9a
Merge branch 'main' of github.com:huggingface/transformers into fix-s…
ArthurZucker Nov 15, 2023
332ecb5
nits
ArthurZucker Nov 15, 2023
2fb4513
nits
ArthurZucker Nov 15, 2023
1d3f38c
revert unrelated changes
ArthurZucker Nov 15, 2023
5045c88
nits nits nits
ArthurZucker Nov 15, 2023
f173225
styling
ArthurZucker Nov 15, 2023
2947e2b
don't break llama just yet
ArthurZucker Nov 15, 2023
f62694c
revert llama changes
ArthurZucker Nov 15, 2023
2ec474b
safe arg check
ArthurZucker Nov 15, 2023
1062897
fixup
ArthurZucker Nov 15, 2023
ace0d0f
Add a test for T5
ArthurZucker Nov 16, 2023
1f621b2
Necessary changes
ArthurZucker Nov 16, 2023
20ca1be
Tests passing, added tokens need to not be normalized. If the added t…
ArthurZucker Nov 16, 2023
080186c
Add even more tests, when normalization is set to True (which does no…
ArthurZucker Nov 16, 2023
6388dd8
Merge branch 'main' of github.com:huggingface/transformers into fix-s…
ArthurZucker Nov 16, 2023
6884ee4
Add even more tests, when normalization is set to True (which does no…
ArthurZucker Nov 16, 2023
31d34bd
Update to main
ArthurZucker Nov 17, 2023
ca2bfb5
Merge branch 'main' of github.com:huggingface/transformers into fix-s…
ArthurZucker Dec 13, 2023
79e51d2
nits
ArthurZucker Dec 13, 2023
efdd11d
fmt
ArthurZucker Dec 13, 2023
575bd5a
more and more test
ArthurZucker Dec 13, 2023
cfd3e8d
comments
ArthurZucker Dec 13, 2023
06260b1
Merge branch 'main' of github.com:huggingface/transformers into fix-s…
ArthurZucker Jan 17, 2024
8f309e5
revert change as tests are failing
ArthurZucker Jan 18, 2024
084c279
make the test more readble
ArthurZucker Jan 18, 2024
7fdee17
nits
ArthurZucker Jan 18, 2024
6f476cc
refactor the test
ArthurZucker Jan 18, 2024
1c3f477
nit
ArthurZucker Jan 18, 2024
305e52c
updates
ArthurZucker Jan 18, 2024
714dd61
simplify
ArthurZucker Jan 18, 2024
18e0d4a
style
ArthurZucker Jan 18, 2024
fa87e6c
style
ArthurZucker Jan 18, 2024
49ac0cb
style convert slow
ArthurZucker Jan 18, 2024
a709039
Update src/transformers/convert_slow_tokenizer.py
ArthurZucker Jan 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions src/transformers/convert_slow_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from typing import Dict, List, Tuple

from packaging import version

ArthurZucker marked this conversation as resolved.
Show resolved Hide resolved
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import BPE, Unigram, WordPiece

Expand Down Expand Up @@ -552,15 +553,22 @@ def tokenizer(self, proto):

# NOTE(review): this is a rendered diff hunk whose +/- markers and indentation were lost,
# so superseded and current lines appear interleaved; it is not runnable as shown.
# Builds the `normalizers.Sequence` for the converted tokenizer from the proto's normalizer spec.
def normalizer(self, proto):
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
# Steps shared by both branches: right-side strip, then rewrite every run of
# two or more spaces as a single "▁".
_normalizers = [
normalizers.Strip(left=False, right=True), # stripping is important
normalizers.Replace(Regex(" {2,}"), "▁"),
]
if not precompiled_charsmap:
# presumably the superseded (pre-change) return, which collapsed space runs to " " - TODO confirm against the PR
return normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
# No precompiled charsmap: only the shared steps are applied.
return normalizers.Sequence(_normalizers)
else:
# presumably the superseded (pre-change) return spanning the next three lines - TODO confirm against the PR
return normalizers.Sequence(
[normalizers.Precompiled(precompiled_charsmap), normalizers.Replace(Regex(" {2,}"), " ")]
)
# Precompiled charsmap present: apply it first, then the shared steps.
return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers)

# NOTE(review): rendered diff hunk with +/- markers and indentation lost; superseded and
# current lines are interleaved, so it is not runnable as shown.
# Returns the Metaspace pre-tokenizer configured with `replacement` and `add_prefix_space`.
def pre_tokenizer(self, replacement, add_prefix_space):
# presumably the superseded (pre-change) return, replaced by the lines below - TODO confirm against the PR
return pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
# Default scheme is "always"; use "first" only when the original tokenizer
# has a `legacy` attribute and that attribute is falsy.
prepend_scheme = "always"
if hasattr(self.original_tokenizer, "legacy") and not self.original_tokenizer.legacy:
prepend_scheme = "first"
return pre_tokenizers.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space, prepend_scheme=prepend_scheme
)

# This converter defines no post-processor: it always returns None.
# (Indentation was lost in this rendered diff view.)
def post_processor(self):
return None
Expand Down
35 changes: 35 additions & 0 deletions tests/models/t5/test_tokenization_t5.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,41 @@ def test_some_edge_cases(self):
self.assertEqual(tokens, [])
self.assertEqual(tokens, tokenizer.sp_model.encode("▁", out_type=str))

def test_fast_slow_edge_cases(self):
    """Compare slow and fast T5 tokenization around an added token and surrounding spaces.

    Two inputs are used: ``edge_case`` (no space before the added token in its
    first occurrence) and ``hard_case`` (spaces on both sides of the added token
    and trailing whitespace). With ``normalized=False`` the slow and fast
    tokenizers must produce the same tokens. With ``normalized=True`` only the
    fast tokenizer is checked, against its own expected output.
    """
    # We are testing spaces before and spaces after special tokens + space transformations
    slow_tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
    fast_tokenizer = T5TokenizerFast.from_pretrained("t5-base", legacy=False, from_slow=True)
    slow_tokenizer.add_tokens(AddedToken("<new_token_test_>", rstrip=False, lstrip=False, normalized=False))
    fast_tokenizer.add_tokens(AddedToken("<new_token_test_>", rstrip=False, lstrip=False, normalized=False))

    edge_case = "Hey!<new_token_test_>. How</s>Hey <new_token_test_>!"
    EXPECTED_SLOW = ["▁Hey", "!", "<new_token_test_>", ".", "▁How", "</s>", "He", "y", "<new_token_test_>", "!"]  # fmt: skip
    with self.subTest(f"slow {edge_case} normalized = False"):
        self.assertEqual(slow_tokenizer.tokenize(edge_case), EXPECTED_SLOW)
    with self.subTest(f"fast {edge_case} normalized = False"):
        self.assertEqual(fast_tokenizer.tokenize(edge_case), EXPECTED_SLOW)

    hard_case = "Hey! <new_token_test_>. How</s>   Hey   <new_token_test_>  !     .     "
    EXPECTED_SLOW = ["▁Hey", "!", "<new_token_test_>", ".", "▁How", "</s>", "▁Hey", "<new_token_test_>", "▁", "!", "▁", "."]  # fmt: skip
    # Label each sub-test with the input it actually tokenizes so failures are attributable.
    with self.subTest(f"slow {hard_case} normalized = False"):
        self.assertEqual(slow_tokenizer.tokenize(hard_case), EXPECTED_SLOW)
    with self.subTest(f"fast {hard_case} normalized = False"):
        self.assertEqual(fast_tokenizer.tokenize(hard_case), EXPECTED_SLOW)

    # Rebuild the fast tokenizer so the token can be re-added with `normalized=True`.
    fast_tokenizer = T5TokenizerFast.from_pretrained("t5-base", legacy=False, from_slow=True)
    fast_tokenizer.add_tokens(AddedToken("<new_token_test_>", rstrip=False, lstrip=False, normalized=True))

    # `normalized=True` is the default normalization scheme when adding a token. Normalize -> don't strip the space.
    # The remaining issue is that the slow tokenizer should NOT strip the space either if we want to
    # simulate sentencepiece token addition, so only the fast tokenizer is checked here.

    EXPECTED_FAST = ["▁Hey", "!", "<new_token_test_>", ".", "▁How", "</s>", "He", "y", "▁", "<new_token_test_>", "!"]  # fmt: skip
    with self.subTest(f"fast {edge_case} normalized = True"):
        self.assertEqual(fast_tokenizer.tokenize(edge_case), EXPECTED_FAST)

    EXPECTED_FAST = ["▁Hey", "!", "▁", "<new_token_test_>", ".", "▁How", "</s>", "▁Hey", "▁", "<new_token_test_>", "▁", "!", "▁", "."]  # fmt: skip
    with self.subTest(f"fast {hard_case} normalized = True"):
        self.assertEqual(fast_tokenizer.tokenize(hard_case), EXPECTED_FAST)


@require_sentencepiece
@require_tokenizers
Expand Down
Loading