From 2530955b43c08277be00ba364d3264b8b224b813 Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Wed, 17 Mar 2021 11:35:09 -0700
Subject: [PATCH 1/2] use bisect to add one token to unique_no_split_tokens

---
 src/transformers/tokenization_utils.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index 5ae55b80f2887b..fdd4c1c7e4b9cb 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -16,6 +16,7 @@
 Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see
 tokenization_utils_fast.py
 """
+import bisect
 import itertools
 import re
 import unicodedata
@@ -99,6 +100,19 @@ def _is_start_of_word(text):
     return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))
 
 
+def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str):
+    """Inserts one token to an ordered list if it does not already exist.
+    Note: token_list must be sorted.
+    """
+    insertion_idx = bisect.bisect_left(token_list, new_token)
+    # Checks if new_token is already in the ordered token_list
+    if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token:
+        # new_token is in token_list, don't add
+        return
+    else:
+        token_list.insert(insertion_idx, new_token)
+
+
 @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
 class PreTrainedTokenizer(PreTrainedTokenizerBase):
     """
@@ -199,10 +213,16 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
 
         # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert)
         if special_tokens:
-            self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
+            if len(new_tokens) == 1:
+                _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0])
+            else:
+                self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
         else:
             # Or on the newly added tokens
-            self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
+            if len(tokens_to_add) == 1:
+                _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0])
+            else:
+                self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
 
         return len(tokens_to_add)
 

From 999a6bf352964f6f2bfc6e959c41d42d6e8eee5f Mon Sep 17 00:00:00 2001
From: Cheng Chen
Date: Wed, 17 Mar 2021 20:10:19 +0000
Subject: [PATCH 2/2] fix style

---
 src/transformers/tokenization_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index fdd4c1c7e4b9cb..a40e64b2d44269 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -101,8 +101,8 @@ def _is_start_of_word(text):
 
 
 def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str):
-    """Inserts one token to an ordered list if it does not already exist.
-    Note: token_list must be sorted.
+    """
+    Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted.
     """
     insertion_idx = bisect.bisect_left(token_list, new_token)
     # Checks if new_token is already in the ordered token_list
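
For illustration only, not part of the patch: a minimal standalone sketch of the bisect-based insertion that the new _insert_one_token_to_ordered_list helper performs, assuming a plain sorted list of token strings; the function name and sample tokens below are hypothetical.

    import bisect
    from typing import List


    def insert_sorted_unique(token_list: List[str], new_token: str) -> None:
        # Locate the leftmost position that keeps token_list sorted.
        insertion_idx = bisect.bisect_left(token_list, new_token)
        # If the token is already present at that position, do nothing.
        if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token:
            return
        token_list.insert(insertion_idx, new_token)


    tokens = ["<cls>", "<mask>", "<sep>"]
    insert_sorted_unique(tokens, "<pad>")   # inserted between "<mask>" and "<sep>"
    insert_sorted_unique(tokens, "<mask>")  # already present, list unchanged
    print(tokens)  # ['<cls>', '<mask>', '<pad>', '<sep>']

Compared with rebuilding sorted(set(...).union(...)) on every call, this adds a single token with one binary search plus one list insert, which is what the patch switches to when exactly one token is added.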