From 8efcd1307f4857bf991b781a9a41dd4e89441e20 Mon Sep 17 00:00:00 2001 From: rchan Date: Fri, 7 Oct 2022 17:08:26 +0100 Subject: [PATCH 1/4] Remove dependency of Roberta in Blenderbot --- .../blenderbot/tokenization_blenderbot.py | 382 ++++++++++++++++-- .../tokenization_blenderbot_fast.py | 254 ++++++++++-- 2 files changed, 572 insertions(+), 64 deletions(-) diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index 8fabbbf6f24a56..865a0302ceef35 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -14,10 +14,18 @@ # limitations under the License. """Tokenization class for Blenderbot.""" -from typing import TYPE_CHECKING, List, Optional +import json +import os +from functools import lru_cache +from typing import TYPE_CHECKING, List, Optional, Tuple +import regex as re + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer from ...utils import logging -from ..roberta.tokenization_roberta import RobertaTokenizer + + +# from ..roberta.tokenization_roberta import RobertaTokenizer if TYPE_CHECKING: @@ -43,53 +51,365 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128} -class BlenderbotTokenizer(RobertaTokenizer): - r""" - Construct a Blenderbot tokenizer. +@lru_cache() +# Copied from transformers.models.roberta.tokenization_roberta.bytes_to_unicode +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +# Copied from transformers.models.roberta.tokenization_roberta.get_pairs +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer with roberta-base->facebook/blenderbot-3B, Roberta->Blenderbot, RoBERTa->Blenderbot +class BlenderbotTokenizer(PreTrainedTokenizer): + """ + Constructs a Blenderbot tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ``` + >>> from transformers import BlenderbotTokenizer + >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B") + >>> tokenizer("Hello world")['input_ids'] + [0, 31414, 232, 328, 2] + >>> tokenizer(" Hello world")['input_ids'] + [0, 20920, 232, 2] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one). + + + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. - [`Blenderbot`] is nearly identical to [`RobertaTokenizer`] and runs end-to-end tokenization: punctuation splitting - and wordpiece. The only difference is that it doesn't add BOS token to the beginning of sequences. + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - Refer to superclass [`RobertaTokenizer`] for usage examples and documentation concerning parameters. + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (Blenderbot tokenizer detect beginning of words by the preceding space). """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + add_prefix_space=False, + **kwargs + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None): + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + + # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A Blenderbot sequence has the following format: - - single sequence: ` X ` + - single sequence: ` X ` + - pair of sequences: ` A B ` Args: token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added + List of IDs to which the special tokens will be added. token_ids_1 (`List[int]`, *optional*): - Will be ignored + Optional second list of IDs for sequence pairs. Returns: - `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ - return token_ids_0 + [self.eos_token_id] - - def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]: - inputs = [] - for is_user, text in conversation.iter_texts(): - if is_user: - # We need to space prefix as it's being done within blenderbot - inputs.append(" " + text) - else: - # Generated responses should contain them already. - inputs.append(text) - - full_string = " ".join(inputs) - input_ids = self.encode(full_string) - if len(input_ids) > self.model_max_length: - input_ids = input_ids[-self.model_max_length :] - logger.warning(f"Trimmed input from conversation as it was longer than {self.model_max_length} tokens.") - return input_ids + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. Blenderbot does + not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): + text = " " + text + return (text, kwargs) def get_pairs(word): diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py index 258068b9c343f9..a72425a3858a60 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py @@ -13,11 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. """Fast Tokenization class for Blenderbot.""" +import json +from typing import TYPE_CHECKING, List, Optional, Tuple -from typing import TYPE_CHECKING, List, Optional +from tokenizers import pre_tokenizers, processors +from ...tokenization_utils_base import AddedToken, BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast from ...utils import logging -from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast from .tokenization_blenderbot import BlenderbotTokenizer @@ -44,51 +47,236 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128} -class BlenderbotTokenizerFast(RobertaTokenizerFast): - r""" - Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's *tokenizers* library). +# Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast with roberta-base->facebook/blenderbot-3B, Roberta->Blenderbot, RoBERTa->Blenderbot +class BlenderbotTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 + tokenizer, using byte-level Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ``` + >>> from transformers import BlenderbotTokenizerFast + >>> tokenizer = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B") + >>> tokenizer("Hello world")['input_ids'] + [0, 31414, 232, 328, 2] + >>> tokenizer(" Hello world")['input_ids'] + [0, 20920, 232, 2] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. + + + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. - [`BlenderbotFast`] is nearly identical to [`RobertaTokenizerFast`] and runs end-to-end tokenization: punctuation - splitting and wordpiece. The only difference is that it doesn't add BOS token to the beginning of sequences. + - Refer to superclass [`RobertaTokenizerFast`] for usage examples and documentation concerning parameters. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (Blenderbot tokenizer detect beginning of words by the preceding space). + trim_offsets (`bool`, *optional*, defaults to `True`): + Whether the post processing step should trim offsets to avoid including whitespaces. """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] slow_tokenizer_class = BlenderbotTokenizer - def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None): + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + add_prefix_space=False, + trim_offsets=True, + **kwargs + ): + super().__init__( + vocab_file, + merges_file, + tokenizer_file=tokenizer_file, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + trim_offsets=trim_offsets, + **kwargs, + ) + + pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) + if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: + pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) + pre_tok_state["add_prefix_space"] = add_prefix_space + self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state) + + self.add_prefix_space = add_prefix_space + + tokenizer_component = "post_processor" + tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None) + if tokenizer_component_instance: + state = json.loads(tokenizer_component_instance.__getstate__()) + + # The lists 'sep' and 'cls' must be cased in tuples for the object `post_processor_class` + if "sep" in state: + state["sep"] = tuple(state["sep"]) + if "cls" in state: + state["cls"] = tuple(state["cls"]) + + changes_to_apply = False + + if state.get("add_prefix_space", add_prefix_space) != add_prefix_space: + state["add_prefix_space"] = add_prefix_space + changes_to_apply = True + + if state.get("trim_offsets", trim_offsets) != trim_offsets: + state["trim_offsets"] = trim_offsets + changes_to_apply = True + + if changes_to_apply: + component_class = getattr(processors, state.pop("type")) + new_value = component_class(**state) + setattr(self.backend_tokenizer, tokenizer_component, new_value) + + @property + def mask_token(self) -> str: + """ + `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not + having been set. + + Blenderbot tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will + greedily comprise the space before the **. + """ + if self._mask_token is None: + if self.verbose: + logger.error("Using mask_token, but it is not set yet.") + return None + return str(self._mask_token) + + @mask_token.setter + def mask_token(self, value): + """ + Overriding the default behavior of the mask token to have it eat the space before it. + + This is needed to preserve backward compatibility with all the previously used models based on Blenderbot. """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A Blenderbot sequence has the following format: + # Mask token behave like a normal word, i.e. include the space before it + # So we set lstrip to True + value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value + self._mask_token = value - - single sequence: ` X ` + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + assert self.add_prefix_space or not is_split_into_words, ( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._batch_encode_plus(*args, **kwargs) + + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + assert self.add_prefix_space or not is_split_into_words, ( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._encode_plus(*args, **kwargs) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + if token_ids_1 is None: + return output + + return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. Blenderbot does + not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added + List of IDs. token_ids_1 (`List[int]`, *optional*): - Will be ignored + Optional second list of IDs for sequence pairs. Returns: - `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + `List[int]`: List of zeros. """ - return token_ids_0 + [self.eos_token_id] - - def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]: - inputs = [] - for is_user, text in conversation.iter_texts(): - if is_user: - # We need to space prefix as it's being done within blenderbot - inputs.append(" " + text) - else: - # Generated responses should contain them already. - inputs.append(text) - - full_string = " ".join(inputs) - input_ids = self.encode(full_string) - if len(input_ids) > self.model_max_length: - input_ids = input_ids[-self.model_max_length :] - logger.warning(f"Trimmed input from conversation as it was longer than {self.model_max_length} tokens.") - return input_ids + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] From 0182c43372424f13cfafff49507762f794614d6d Mon Sep 17 00:00:00 2001 From: rchan Date: Mon, 10 Oct 2022 12:51:03 +0100 Subject: [PATCH 2/4] Move Copied from statements to each method of the Roberta classes --- .../blenderbot/tokenization_blenderbot.py | 87 +++++++++---------- .../tokenization_blenderbot_fast.py | 49 +++++++++-- 2 files changed, 83 insertions(+), 53 deletions(-) diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index 865a0302ceef35..e7641dbf5b9ca9 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -25,9 +25,6 @@ from ...utils import logging -# from ..roberta.tokenization_roberta import RobertaTokenizer - - if TYPE_CHECKING: from transformers.pipelines.conversational import Conversation @@ -92,7 +89,6 @@ def get_pairs(word): return pairs -# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer with roberta-base->facebook/blenderbot-3B, Roberta->Blenderbot, RoBERTa->Blenderbot class BlenderbotTokenizer(PreTrainedTokenizer): """ Constructs a Blenderbot tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding. @@ -174,6 +170,7 @@ class BlenderbotTokenizer(PreTrainedTokenizer): max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "attention_mask"] + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.__init__ with Roberta->Blenderbot, RoBERTa->Blenderbot def __init__( self, vocab_file, @@ -229,12 +226,15 @@ def __init__( self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") @property + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size with Roberta->Blenderbot, RoBERTa->Blenderbot def vocab_size(self): return len(self.encoder) + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab with Roberta->Blenderbot, RoBERTa->Blenderbot def get_vocab(self): return dict(self.encoder, **self.added_tokens_encoder) + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe with Roberta->Blenderbot, RoBERTa->Blenderbot def bpe(self, token): if token in self.cache: return self.cache[token] @@ -277,6 +277,7 @@ def bpe(self, token): self.cache[token] = word return word + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._tokenize with Roberta->Blenderbot, RoBERTa->Blenderbot def _tokenize(self, text): """Tokenize a string.""" bpe_tokens = [] @@ -287,20 +288,24 @@ def _tokenize(self, text): bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) return bpe_tokens + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_token_to_id with Roberta->Blenderbot, RoBERTa->Blenderbot def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" return self.encoder.get(token, self.encoder.get(self.unk_token)) + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_id_to_token with Roberta->Blenderbot, RoBERTa->Blenderbot def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.decoder.get(index) + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.convert_tokens_to_string with Roberta->Blenderbot, RoBERTa->Blenderbot def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" text = "".join(tokens) text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) return text + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.save_vocabulary with Roberta->Blenderbot, RoBERTa->Blenderbot def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error(f"Vocabulary path ({save_directory}) should be a directory") @@ -330,31 +335,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return vocab_file, merge_file - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A Blenderbot sequence has the following format: - - - single sequence: ` X ` - - pair of sequences: ` A B ` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_special_tokens_mask with Roberta->Blenderbot, RoBERTa->Blenderbot def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False ) -> List[int]: @@ -382,6 +363,7 @@ def get_special_tokens_mask( return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.create_token_type_ids_from_sequences with Roberta->Blenderbot, RoBERTa->Blenderbot def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: @@ -405,24 +387,41 @@ def create_token_type_ids_from_sequences( return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.prepare_for_tokenization with Roberta->Blenderbot, RoBERTa->Blenderbot def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): text = " " + text return (text, kwargs) - -def get_pairs(word): - """ - Return set of symbol pairs in a word. - - Word is represented as tuple of symbols (symbols being variable-length strings). - """ - pairs = set() - prev_char = word[0] - for char in word[1:]: - pairs.add((prev_char, char)) - prev_char = char - - pairs = set(pairs) - return pairs + def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A Blenderbot sequence has the following format: + - single sequence: ` X ` + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (`List[int]`, *optional*): + Will be ignored + Returns: + `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + return token_ids_0 + [self.eos_token_id] + + def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]: + inputs = [] + for is_user, text in conversation.iter_texts(): + if is_user: + # We need to space prefix as it's being done within blenderbot + inputs.append(" " + text) + else: + # Generated responses should contain them already. + inputs.append(text) + + full_string = " ".join(inputs) + input_ids = self.encode(full_string) + if len(input_ids) > self.model_max_length: + input_ids = input_ids[-self.model_max_length :] + logger.warning(f"Trimmed input from conversation as it was longer than {self.model_max_length} tokens.") + return input_ids diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py index a72425a3858a60..20f9c9151c14d3 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py @@ -47,7 +47,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128} -# Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast with roberta-base->facebook/blenderbot-3B, Roberta->Blenderbot, RoBERTa->Blenderbot class BlenderbotTokenizerFast(PreTrainedTokenizerFast): """ Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 @@ -133,6 +132,7 @@ class BlenderbotTokenizerFast(PreTrainedTokenizerFast): model_input_names = ["input_ids", "attention_mask"] slow_tokenizer_class = BlenderbotTokenizer + # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.__init__ with Roberta->Blenderbot, RoBERTa->Blenderbot def __init__( self, vocab_file=None, @@ -202,6 +202,7 @@ def __init__( setattr(self.backend_tokenizer, tokenizer_component, new_value) @property + # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.mask_token with Roberta->Blenderbot, RoBERTa->Blenderbot def mask_token(self) -> str: """ `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not @@ -217,17 +218,19 @@ def mask_token(self) -> str: return str(self._mask_token) @mask_token.setter + # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.mask_token with Roberta->Blenderbot, RoBERTa->Blenderbot def mask_token(self, value): """ Overriding the default behavior of the mask token to have it eat the space before it. - This is needed to preserve backward compatibility with all the previously used models based on Blenderbot. + This is needed to preserve backward compatibility with all the previously used models based on Roberta. """ # Mask token behave like a normal word, i.e. include the space before it # So we set lstrip to True value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value self._mask_token = value + # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast._batch_encode_plus with Roberta->Blenderbot, RoBERTa->Blenderbot def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: is_split_into_words = kwargs.get("is_split_into_words", False) assert self.add_prefix_space or not is_split_into_words, ( @@ -237,6 +240,7 @@ def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: return super()._batch_encode_plus(*args, **kwargs) + # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast._encode_plus with Roberta->Blenderbot, RoBERTa->Blenderbot def _encode_plus(self, *args, **kwargs) -> BatchEncoding: is_split_into_words = kwargs.get("is_split_into_words", False) @@ -247,17 +251,12 @@ def _encode_plus(self, *args, **kwargs) -> BatchEncoding: return super()._encode_plus(*args, **kwargs) + # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.save_vocabulary with Roberta->Blenderbot, RoBERTa->Blenderbot def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] - if token_ids_1 is None: - return output - - return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] - + # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.create_token_type_ids_from_sequences with Roberta->Blenderbot, RoBERTa->Blenderbot def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: @@ -280,3 +279,35 @@ def create_token_type_ids_from_sequences( if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A Blenderbot sequence has the following format: + - single sequence: ` X ` + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (`List[int]`, *optional*): + Will be ignored + Returns: + `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + return token_ids_0 + [self.eos_token_id] + + def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]: + inputs = [] + for is_user, text in conversation.iter_texts(): + if is_user: + # We need to space prefix as it's being done within blenderbot + inputs.append(" " + text) + else: + # Generated responses should contain them already. + inputs.append(text) + + full_string = " ".join(inputs) + input_ids = self.encode(full_string) + if len(input_ids) > self.model_max_length: + input_ids = input_ids[-self.model_max_length :] + logger.warning(f"Trimmed input from conversation as it was longer than {self.model_max_length} tokens.") + return input_ids From a6ae9e82e7cf193f592489c139cc1e3919650baa Mon Sep 17 00:00:00 2001 From: rchan Date: Mon, 10 Oct 2022 13:33:12 +0100 Subject: [PATCH 3/4] Remove copied from line for mask_token.setter --- .../models/blenderbot/tokenization_blenderbot_fast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py index 20f9c9151c14d3..10ca7d34b2d73e 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py @@ -218,7 +218,6 @@ def mask_token(self) -> str: return str(self._mask_token) @mask_token.setter - # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.mask_token with Roberta->Blenderbot, RoBERTa->Blenderbot def mask_token(self, value): """ Overriding the default behavior of the mask token to have it eat the space before it. From 4a836cb768d365b8dc2c0242bb8c4e3b43fea2f0 Mon Sep 17 00:00:00 2001 From: rchan Date: Mon, 10 Oct 2022 13:36:20 +0100 Subject: [PATCH 4/4] update output from example in docs --- src/transformers/models/blenderbot/tokenization_blenderbot.py | 4 ++-- .../models/blenderbot/tokenization_blenderbot_fast.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index e7641dbf5b9ca9..ef0a24e06d9ae0 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -100,9 +100,9 @@ class BlenderbotTokenizer(PreTrainedTokenizer): >>> from transformers import BlenderbotTokenizer >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B") >>> tokenizer("Hello world")['input_ids'] - [0, 31414, 232, 328, 2] + [6950, 1085, 2] >>> tokenizer(" Hello world")['input_ids'] - [0, 20920, 232, 2] + [6950, 1085, 2] ``` You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py index 10ca7d34b2d73e..508c643f4d8236 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py @@ -59,9 +59,9 @@ class BlenderbotTokenizerFast(PreTrainedTokenizerFast): >>> from transformers import BlenderbotTokenizerFast >>> tokenizer = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B") >>> tokenizer("Hello world")['input_ids'] - [0, 31414, 232, 328, 2] + [6950, 1085, 2] >>> tokenizer(" Hello world")['input_ids'] - [0, 20920, 232, 2] + [6950, 1085, 2] ``` You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you