From c16e39c520df948dd184f2e63320236b856bc51b Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 15 Sep 2022 17:12:58 +0200
Subject: [PATCH] Fixing OPT fast tokenizer option. (#18753)

* Fixing OPT fast tokenizer option.

* Remove dependency on `pt`.

* Move it to GPT2 tokenization tests.

* Added a few tests.

---
 src/transformers/convert_slow_tokenizer.py   | 16 +++++-
 .../models/gpt2/tokenization_gpt2_fast.py    | 11 +---
 tests/models/gpt2/test_tokenization_gpt2.py  | 56 ++++++++++++++++++-
 3 files changed, 70 insertions(+), 13 deletions(-)

diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index 427ce3516591c3..6fbd7b49b066a0 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -282,8 +282,20 @@ def converted(self) -> Tokenizer:
         tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=self.original_tokenizer.add_prefix_space)
         tokenizer.decoder = decoders.ByteLevel()
-        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
-
+        if self.original_tokenizer.add_bos_token:
+            bos = self.original_tokenizer.bos_token
+            bos_token_id = self.original_tokenizer.bos_token_id
+            tokenizer.post_processor = processors.TemplateProcessing(
+                single=f"{bos}:0 $A:0",
+                pair=f"{bos}:0 $A:0 $B:1",
+                special_tokens=[
+                    (bos, bos_token_id),
+                ],
+            )
+        else:
+            # XXX trim_offsets=False actually means this post_processor doesn't
+            # really do anything.
+            tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
         return tokenizer
diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
index ddd4ad56fde18a..eefd35aa94a7a9 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
@@ -146,16 +146,7 @@ def __init__(
             **kwargs,
         )
 
-        if kwargs.pop("add_bos_token", False):
-            model_id = kwargs.pop("name_or_path", "")
-            raise ValueError(
-                "Currenty GPT2's fast tokenizer does NOT support adding a BOS token."
-                "Instead you should use GPT2's slow tokenizer class `GPT2Tokenizer` as follows: \n"
-                f"`GPT2Tokenizer.from_pretrained('{model_id}')`\nor\n"
-                f"`AutoTokenizer.from_pretrained('{model_id}', use_fast=False)`\n"
-                "This issue will be fixed soon, see: https://github.com/huggingface/tokenizers/pull/1005."
-                " so that the fast tokenizer works correctly."
-            )
+        self.add_bos_token = kwargs.pop("add_bos_token", False)
 
         pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
         if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py
index 3b2272d8548bc7..3273fbfce77378 100644
--- a/tests/models/gpt2/test_tokenization_gpt2.py
+++ b/tests/models/gpt2/test_tokenization_gpt2.py
@@ -18,7 +18,7 @@
 import os
 import unittest
 
-from transformers import GPT2Tokenizer, GPT2TokenizerFast
+from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast
 from transformers.models.gpt2.tokenization_gpt2 import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_tokenizers
 
@@ -275,3 +275,57 @@ def test_special_tokens_mask_input_pairs_and_bos_token(self):
         ]
         filtered_sequence = [x for x in filtered_sequence if x is not None]
         self.assertEqual(encoded_sequence, filtered_sequence)
+
+
+@require_tokenizers
+class OPTTokenizationTest(unittest.TestCase):
+    def test_serialize_deserialize_fast_opt(self):
+        # More context:
+        # https://huggingface.co/wjmcat/opt-350m-paddle/discussions/1
+        # https://huggingface.slack.com/archives/C01N44FJDHT/p1653511495183519
+        # https://github.com/huggingface/transformers/pull/17088#discussion_r871246439
+
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", from_slow=True)
+        text = "A photo of a cat"
+
+        tokens_ids = tokenizer.encode(
+            text,
+        )
+        self.assertEqual(tokens_ids, [2, 250, 1345, 9, 10, 4758])
+        tokenizer.save_pretrained("test_opt")
+
+        tokenizer = AutoTokenizer.from_pretrained("./test_opt")
+        tokens_ids = tokenizer.encode(
+            text,
+        )
+        self.assertEqual(tokens_ids, [2, 250, 1345, 9, 10, 4758])
+
+    def test_fast_slow_equivalence(self):
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", use_fast=False)
+        text = "A photo of a cat"
+
+        tokens_ids = tokenizer.encode(
+            text,
+        )
+        # Same as above
+        self.assertEqual(tokens_ids, [2, 250, 1345, 9, 10, 4758])
+
+    def test_users_can_modify_bos(self):
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", from_slow=True)
+
+        tokenizer.bos_token = "bos"
+        tokenizer.bos_token_id = tokenizer.get_vocab()["bos"]
+
+        text = "A photo of a cat"
+        tokens_ids = tokenizer.encode(
+            text,
+        )
+        # We changed the bos token
+        self.assertEqual(tokens_ids, [31957, 250, 1345, 9, 10, 4758])
+        tokenizer.save_pretrained("./tok")
+        tokenizer = AutoTokenizer.from_pretrained("./tok")
+        self.assertTrue(tokenizer.is_fast)
+        tokens_ids = tokenizer.encode(
+            text,
+        )
+        self.assertEqual(tokens_ids, [31957, 250, 1345, 9, 10, 4758])
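
Note (not part of the patch): a minimal sketch of the user-visible effect,
assuming network access to the `facebook/opt-350m` checkpoint on the Hub and
that its `bos_token` is `</s>` (id 2). After this change the fast tokenizer
honours `add_bos_token` via the TemplateProcessing post-processor installed
by the converter above, so it matches the slow tokenizer:

    from transformers import AutoTokenizer

    text = "A photo of a cat"

    # Fast tokenizer (the default) now prepends BOS for OPT checkpoints.
    fast = AutoTokenizer.from_pretrained("facebook/opt-350m")

    # Slow tokenizer, for comparison.
    slow = AutoTokenizer.from_pretrained("facebook/opt-350m", use_fast=False)

    # Both should now agree, BOS included (ids taken from the tests above).
    assert fast.encode(text) == slow.encode(text) == [2, 250, 1345, 9, 10, 4758]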