From 17b233ba78521a76e95818b29b2b6071759dbd8a Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Wed, 16 Aug 2023 15:28:45 +0000
Subject: [PATCH 1/5] properly support Sequence of pretokenizers

---
 .../models/bloom/tokenization_bloom_fast.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py
index d20065c90338c0..df254cc9ffad88 100644
--- a/src/transformers/models/bloom/tokenization_bloom_fast.py
+++ b/src/transformers/models/bloom/tokenization_bloom_fast.py
@@ -131,10 +131,16 @@ def __init__(
             **kwargs,
         )
         pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
+
+        pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
+        if pre_tok_class == "Sequence":
+            for pre_tok in pre_tok_state.get("pretokenizers"):
+                if hasattr(pre_tok, "add_prefix_space"):
+                    pre_tok["add_prefix_space"] = add_prefix_space
+        elif hasattr(pre_tok, "add_prefix_space"):
+            pre_tok["add_prefix_space"] = add_prefix_space
+
+        self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)

         self.add_prefix_space = add_prefix_space


From 00138c34e9b137e5cc278b6ad4d85788eac1f570 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Wed, 16 Aug 2023 16:21:54 +0000
Subject: [PATCH 2/5] actual fix

---
 .../models/bloom/tokenization_bloom_fast.py | 20 +++++++++----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py
index df254cc9ffad88..27d1fcb0f6564c 100644
--- a/src/transformers/models/bloom/tokenization_bloom_fast.py
+++ b/src/transformers/models/bloom/tokenization_bloom_fast.py
@@ -131,17 +131,15 @@ def __init__(
             **kwargs,
         )
         pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-
-        pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-        if pre_tok_class == "Sequence":
-            for pre_tok in pre_tok_state.get("pretokenizers"):
-                if hasattr(pre_tok, "add_prefix_space"):
-                    pre_tok["add_prefix_space"] = add_prefix_space
-        elif hasattr(pre_tok, "add_prefix_space"):
-            pre_tok["add_prefix_space"] = add_prefix_space
-
-        self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
+        pre_tokenizers = pre_tok_state.get("pretokenizers", pre_tok_state)
+        for pre_tok_idx in range(len(pre_tokenizers)):
+            if not hasattr(pre_tok_state, "pretokenizers"):
+                pre_tok_state["add_prefix_space"] = add_prefix_space
+            if hasattr(pre_tok_state["pretokenizers"][pre_tok_idx], "add_prefix_space"):
+                pre_tok_state["pretokenizers"][pre_tok_idx]["add_prefix_space"] = add_prefix_space
+
+        new_pre_tok_state = json.dumps(pre_tok_state).encode()
+        self.backend_tokenizer.pre_tokenizer.__setstate__(new_pre_tok_state)
         self.add_prefix_space = add_prefix_space

     def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
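
Note on the two JSON-state attempts above: `hasattr()` is called on plain dicts parsed from the pre-tokenizer state, where the correct membership test is `in`; PATCH 1 also compares the class object returned by `getattr` against the string "Sequence", and PATCH 2 shadows the imported `pre_tokenizers` module with a local variable. A minimal sketch of what the dict-based route could look like; the helper name is ours, and it assumes each sub-pre-tokenizer state round-trips through its constructor kwargs, which holds for ByteLevel but is not guaranteed for every type:

    import json

    from tokenizers import pre_tokenizers

    def toggle_add_prefix_space(backend_tokenizer, add_prefix_space):
        # Hypothetical helper, not part of the patches above.
        state = json.loads(backend_tokenizer.pre_tokenizer.__getstate__())
        pre_tok_type = state.pop("type")
        if pre_tok_type == "Sequence":
            # Sequence expects pre-tokenizer objects, so rebuild each one.
            subs = []
            for sub_state in state["pretokenizers"]:
                sub_type = sub_state.pop("type")
                if "add_prefix_space" in sub_state:  # dict membership, not hasattr()
                    sub_state["add_prefix_space"] = add_prefix_space
                subs.append(getattr(pre_tokenizers, sub_type)(**sub_state))
            backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence(subs)
        elif "add_prefix_space" in state:
            state["add_prefix_space"] = add_prefix_space
            backend_tokenizer.pre_tokenizer = getattr(pre_tokenizers, pre_tok_type)(**state)

    # usage inside __init__ would be:
    # toggle_add_prefix_space(self.backend_tokenizer, add_prefix_space)
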
From ae9c0d8999d3540b8547f5070bc95cd7b2e107d2 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Thu, 17 Aug 2023 09:49:22 +0000
Subject: [PATCH 3/5] make sure the fix works. Tests are not working for sure!

---
 .../models/bloom/tokenization_bloom_fast.py | 20 +++++++------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py
index 27d1fcb0f6564c..ec7ac2cb7d6d9e 100644
--- a/src/transformers/models/bloom/tokenization_bloom_fast.py
+++ b/src/transformers/models/bloom/tokenization_bloom_fast.py
@@ -15,11 +15,9 @@
 """Tokenization classes for Bloom."""


-import json
+import pickle
 from typing import TYPE_CHECKING, List, Optional, Tuple

-from tokenizers import pre_tokenizers
-
 from ...tokenization_utils_base import BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import logging
@@ -130,16 +128,12 @@ def __init__(
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )
-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        pre_tokenizers = pre_tok_state.get("pretokenizers", pre_tok_state)
-        for pre_tok_idx in range(len(pre_tokenizers)):
-            if not hasattr(pre_tok_state, "pretokenizers"):
-                pre_tok_state["add_prefix_space"] = add_prefix_space
-            if hasattr(pre_tok_state["pretokenizers"][pre_tok_idx], "add_prefix_space"):
-                pre_tok_state["pretokenizers"][pre_tok_idx]["add_prefix_space"] = add_prefix_space
-
-        new_pre_tok_state = json.dumps(pre_tok_state).encode()
-        self.backend_tokenizer.pre_tokenizer.__setstate__(new_pre_tok_state)
+        pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
+        if add_prefix_space:
+            pre_tok_state.replace(b'"add_prefix_space":false',b'"add_prefix_space":true')
+        else:
+            pre_tok_state.replace(b'"add_prefix_space":true',b'"add_prefix_space":false')
+        self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
         self.add_prefix_space = add_prefix_space

     def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:

From 963158228acf21882a529307517386652236b515 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Thu, 17 Aug 2023 10:18:13 +0000
Subject: [PATCH 4/5] hacky way

---
 src/transformers/models/bloom/tokenization_bloom_fast.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py
index ec7ac2cb7d6d9e..2d5776b238b1b2 100644
--- a/src/transformers/models/bloom/tokenization_bloom_fast.py
+++ b/src/transformers/models/bloom/tokenization_bloom_fast.py
@@ -128,11 +128,10 @@ def __init__(
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )
+        # overwride add_prefix_space
         pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
         if add_prefix_space:
-            pre_tok_state.replace(b'"add_prefix_space":false',b'"add_prefix_space":true')
-        else:
-            pre_tok_state.replace(b'"add_prefix_space":true',b'"add_prefix_space":false')
+            pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false',b'"add_prefix_space": true')
         self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
         self.add_prefix_space = add_prefix_space
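
PATCH 3 above has a classic immutability bug that PATCH 4 then fixes: `bytes.replace` returns a new object, so calling it without reassignment leaves the pickled payload untouched. Note also that PATCH 4 only handles the `add_prefix_space=True` direction and writes `"add_prefix_space": true` with an extra space, so the substitution is not reversible byte-for-byte. A self-contained demonstration of the pitfall, independent of the tokenizer:

    # bytes are immutable: .replace() returns a new object.
    state = b'{"add_prefix_space":false}'
    state.replace(b'false', b'true')          # result discarded, as in PATCH 3
    assert state == b'{"add_prefix_space":false}'
    state = state.replace(b'false', b'true')  # reassigned, as in PATCH 4
    assert state == b'{"add_prefix_space":true}'
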
From 6affdd75e90583a7190d4519a52962a186ce2e9e Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Thu, 17 Aug 2023 10:21:53 +0000
Subject: [PATCH 5/5] add TODO

---
 src/transformers/models/bloom/tokenization_bloom_fast.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py
index 2d5776b238b1b2..b60917c71a042c 100644
--- a/src/transformers/models/bloom/tokenization_bloom_fast.py
+++ b/src/transformers/models/bloom/tokenization_bloom_fast.py
@@ -132,6 +132,7 @@ def __init__(
         pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
         if add_prefix_space:
             pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false',b'"add_prefix_space": true')
+        # TODO decoder should also be udpated to reflect this
         self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
         self.add_prefix_space = add_prefix_space
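
The TODO added in PATCH 5 matters because a ByteLevel decoder serializes the same `add_prefix_space` field, so patching only the pre-tokenizer can leave decoding inconsistent with pre-tokenization. A sketch of how the same byte-substitution trick could be extended to the decoder; `_patch_component` is a hypothetical name, and the literal-substring assumption is inherited from PATCH 4:

    import pickle

    def _patch_component(component, add_prefix_space):
        # Hypothetical helper mirroring PATCH 4: assumes the pickled state
        # embeds the literal byte string b'"add_prefix_space":false'.
        state = pickle.dumps(component)
        if add_prefix_space:
            state = state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
        return pickle.loads(state)

    # backend = tokenizer.backend_tokenizer
    # backend.pre_tokenizer = _patch_component(backend.pre_tokenizer, True)
    # backend.decoder = _patch_component(backend.decoder, True)
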