Skip to content

Commit

Permalink
[Core Tokenization] Support a fix for spm fast models (huggingface#…
Browse files Browse the repository at this point in the history
…26678)

* fix

* last attempt

* current work

* fix forward compatibility

* save all special tokens

* current state

* revert additional changes

* updates

* remove tokenizer.model

* add a test and the fix

* nit

* revert one more break

* fix typefield issue

* quality

* more tests

* fix fields for FC

* more nits?

* new additional changes

* how

* some updates

* the fix

* where do we stand

* nits

* nits

* revert unrelated changes

* nits nits nits

* styling

* don't break llama just yet

* revert llama changes

* safe arg check

* fixup

* Add a test for T5

* Necessary changes

* Tests passing, added tokens need to not be normalized. If the added tokens are normalized, it will the stripping which seems to be unwanted for a normal functioning

* Add even more tests, when normalization is set to True (which does not work 😓 )

* Add even more tests, when normalization is set to True (which does not work 😓 )

* Update to main

* nits

* fmt

* more and more test

* comments

* revert change as tests are failing

* make the test more readble

* nits

* refactor the test

* nit

* updates

* simplify

* style

* style

* style convert slow

* Update src/transformers/convert_slow_tokenizer.py
  • Loading branch information
ArthurZucker authored and wgifford committed Jan 21, 2024
1 parent a0e20f0 commit d231358
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 5 deletions.
17 changes: 12 additions & 5 deletions src/transformers/convert_slow_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,15 +552,22 @@ def tokenizer(self, proto):

def normalizer(self, proto):
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
_normalizers = [
normalizers.Strip(left=False, right=True), # stripping is important
normalizers.Replace(Regex(" {2,}"), "▁"),
]
if not precompiled_charsmap:
return normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
return normalizers.Sequence(_normalizers)
else:
return normalizers.Sequence(
[normalizers.Precompiled(precompiled_charsmap), normalizers.Replace(Regex(" {2,}"), " ")]
)
return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers)

def pre_tokenizer(self, replacement, add_prefix_space):
return pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
prepend_scheme = "always"
if hasattr(self.original_tokenizer, "legacy") and not self.original_tokenizer.legacy:
prepend_scheme = "first"
return pre_tokenizers.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space, prepend_scheme=prepend_scheme
)

def post_processor(self):
return None
Expand Down
35 changes: 35 additions & 0 deletions tests/models/t5/test_tokenization_t5.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,41 @@ def test_some_edge_cases(self):
self.assertEqual(tokens, [])
self.assertEqual(tokens, tokenizer.sp_model.encode("▁", out_type=str))

def test_fast_slow_edge_cases(self):
# We are testing spaces before and spaces after special tokens + space transformations
slow_tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
fast_tokenizer = T5TokenizerFast.from_pretrained("t5-base", legacy=False, from_slow=True)
slow_tokenizer.add_tokens(AddedToken("<new_token_test_>", rstrip=False, lstrip=False, normalized=False))
fast_tokenizer.add_tokens(AddedToken("<new_token_test_>", rstrip=False, lstrip=False, normalized=False))

edge_case = "Hey!<new_token_test_>. How</s>Hey <new_token_test_>!"
EXPECTED_SLOW = ["▁Hey", "!", "<new_token_test_>", ".", "▁How", "</s>", "He", "y", "<new_token_test_>", "!"] # fmt: skip
with self.subTest(f"slow {edge_case} normalized = False"):
self.assertEqual(slow_tokenizer.tokenize(edge_case), EXPECTED_SLOW)
with self.subTest(f"Fast {edge_case} normalized = False"):
self.assertEqual(fast_tokenizer.tokenize(edge_case), EXPECTED_SLOW)

hard_case = "Hey! <new_token_test_>. How</s> Hey <new_token_test_> ! . "
EXPECTED_SLOW = ["▁Hey", "!", "<new_token_test_>", ".", "▁How", "</s>", "▁Hey", "<new_token_test_>", "▁", "!", "▁", "."] # fmt: skip
with self.subTest(f"slow {edge_case} normalized = False"):
self.assertEqual(slow_tokenizer.tokenize(hard_case), EXPECTED_SLOW)
with self.subTest(f"fast {edge_case} normalized = False"):
self.assertEqual(fast_tokenizer.tokenize(hard_case), EXPECTED_SLOW)

fast_tokenizer = T5TokenizerFast.from_pretrained("t5-base", legacy=False, from_slow=True)
fast_tokenizer.add_tokens(AddedToken("<new_token_test_>", rstrip=False, lstrip=False, normalized=True))

# `normalized=True` is the default normalization scheme when adding a token. Normalize -> don't strip the space.
# the issue now is that our slow tokenizer should NOT strip the space if we want to simulate sentencepiece token addition.

EXPECTED_FAST = ["▁Hey", "!", "<new_token_test_>", ".", "▁How", "</s>", "He", "y", "▁", "<new_token_test_>", "!"] # fmt: skip
with self.subTest(f"fast {edge_case} normalized = True"):
self.assertEqual(fast_tokenizer.tokenize(edge_case), EXPECTED_FAST)

EXPECTED_FAST = ['▁Hey', '!', '▁', '<new_token_test_>', '.', '▁How', '</s>', '▁Hey','▁', '<new_token_test_>', '▁', '!', '▁', '.'] # fmt: skip
with self.subTest(f"fast {edge_case} normalized = False"):
self.assertEqual(fast_tokenizer.tokenize(hard_case), EXPECTED_FAST)


@require_sentencepiece
@require_tokenizers
Expand Down

0 comments on commit d231358

Please sign in to comment.