From 8a48df7cb13ca7c6301a9c658d8a9846547d6b8b Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Wed, 27 Jul 2022 12:35:53 -0400 Subject: [PATCH 01/10] Preliminary work on tokenizers --- .../models/m2m_100/tokenization_m2m_100.py | 12 +-- .../models/marian/tokenization_marian.py | 14 +-- .../models/mbart/tokenization_mbart.py | 14 +-- .../models/mbart/tokenization_mbart_fast.py | 14 +-- .../models/mbart50/tokenization_mbart50.py | 14 +-- .../mbart50/tokenization_mbart50_fast.py | 14 +-- .../models/nllb/tokenization_nllb.py | 14 +-- .../models/nllb/tokenization_nllb_fast.py | 14 +-- .../models/plbart/tokenization_plbart.py | 14 +-- .../models/rag/tokenization_rag.py | 12 +-- .../models/tapex/tokenization_tapex.py | 13 +-- src/transformers/tokenization_utils_base.py | 98 ++++++++++++++++++- 12 files changed, 146 insertions(+), 101 deletions(-) diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py index f2e9c855bf90..45beeb7f7f3e 100644 --- a/src/transformers/models/m2m_100/tokenization_m2m_100.py +++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py @@ -346,16 +346,12 @@ def _build_translation_inputs(self, raw_inputs, src_lang: Optional[str], tgt_lan inputs["forced_bos_token_id"] = tgt_lang_id return inputs - @contextmanager - def as_target_tokenizer(self): - """ - Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to - sequence-to-sequence models that need a slightly different processing for the labels. - """ - self.set_tgt_lang_special_tokens(self.tgt_lang) - yield + def _switch_to_input_mode(self): self.set_src_lang_special_tokens(self.src_lang) + def _switch_to_targett_mode(self): + self.set_tgt_lang_special_tokens(self.tgt_lang) + def set_src_lang_special_tokens(self, src_lang: str) -> None: """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" lang_token = self.get_lang_token(src_lang) diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index 62f145e7b798..da9cb66f8603 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -281,18 +281,14 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> Lis # We don't expect to process pairs, but leave the pair logic for API consistency return token_ids_0 + token_ids_1 + [self.eos_token_id] - @contextmanager - def as_target_tokenizer(self): - """ - Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to - sequence-to-sequence models that need a slightly different processing for the labels. 
- """ + def _switch_to_input_mode(self): + self.current_spm = self.spm_source + self.current_encoder = self.encoder + + def _switch_to_target_mode(self): self.current_spm = self.spm_target if self.separate_vocabs: self.current_encoder = self.target_encoder - yield - self.current_spm = self.spm_source - self.current_encoder = self.encoder @property def vocab_size(self) -> int: diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py index 2517dfb584bb..7621cc6bde18 100644 --- a/src/transformers/models/mbart/tokenization_mbart.py +++ b/src/transformers/models/mbart/tokenization_mbart.py @@ -340,15 +340,11 @@ def prepare_seq2seq_batch( self.tgt_lang = tgt_lang return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) - @contextmanager - def as_target_tokenizer(self): - """ - Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to - sequence-to-sequence models that need a slightly different processing for the labels. - """ - self.set_tgt_lang_special_tokens(self.tgt_lang) - yield - self.set_src_lang_special_tokens(self.src_lang) + def _switch_to_input_mode(self): + return self.set_src_lang_special_tokens(self.src_lang) + + def _switch_to_target_mode(self): + return self.set_tgt_lang_special_tokens(self.tgt_lang) def set_src_lang_special_tokens(self, src_lang) -> None: """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py index 52902e3a40f0..6bd45cfcc02d 100644 --- a/src/transformers/models/mbart/tokenization_mbart_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart_fast.py @@ -240,15 +240,11 @@ def prepare_seq2seq_batch( self.tgt_lang = tgt_lang return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) - @contextmanager - def as_target_tokenizer(self): - """ - Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to - sequence-to-sequence models that need a slightly different processing for the labels. - """ - self.set_tgt_lang_special_tokens(self.tgt_lang) - yield - self.set_src_lang_special_tokens(self.src_lang) + def _switch_to_input_mode(self): + return self.set_src_lang_special_tokens(self.src_lang) + + def _switch_to_target_mode(self): + return self.set_tgt_lang_special_tokens(self.tgt_lang) def set_src_lang_special_tokens(self, src_lang) -> None: """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" diff --git a/src/transformers/models/mbart50/tokenization_mbart50.py b/src/transformers/models/mbart50/tokenization_mbart50.py index 145a546c1810..7c1777349c1c 100644 --- a/src/transformers/models/mbart50/tokenization_mbart50.py +++ b/src/transformers/models/mbart50/tokenization_mbart50.py @@ -337,15 +337,11 @@ def prepare_seq2seq_batch( self.tgt_lang = tgt_lang return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) - @contextmanager - def as_target_tokenizer(self): - """ - Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to - sequence-to-sequence models that need a slightly different processing for the labels. 
- """ - self.set_tgt_lang_special_tokens(self.tgt_lang) - yield - self.set_src_lang_special_tokens(self.src_lang) + def _switch_to_input_mode(self): + return self.set_src_lang_special_tokens(self.src_lang) + + def _switch_to_target_mode(self): + return self.set_tgt_lang_special_tokens(self.tgt_lang) def set_src_lang_special_tokens(self, src_lang: str) -> None: """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos].""" diff --git a/src/transformers/models/mbart50/tokenization_mbart50_fast.py b/src/transformers/models/mbart50/tokenization_mbart50_fast.py index 28fb726c476d..09fe1b94cbe1 100644 --- a/src/transformers/models/mbart50/tokenization_mbart50_fast.py +++ b/src/transformers/models/mbart50/tokenization_mbart50_fast.py @@ -211,15 +211,11 @@ def prepare_seq2seq_batch( self.tgt_lang = tgt_lang return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) - @contextmanager - def as_target_tokenizer(self): - """ - Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to - sequence-to-sequence models that need a slightly different processing for the labels. - """ - self.set_tgt_lang_special_tokens(self.tgt_lang) - yield - self.set_src_lang_special_tokens(self.src_lang) + def _switch_to_input_mode(self): + return self.set_src_lang_special_tokens(self.src_lang) + + def _switch_to_target_mode(self): + return self.set_tgt_lang_special_tokens(self.tgt_lang) def set_src_lang_special_tokens(self, src_lang: str) -> None: """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos].""" diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py index ef0ee942bfa9..24d4d21eb5c9 100644 --- a/src/transformers/models/nllb/tokenization_nllb.py +++ b/src/transformers/models/nllb/tokenization_nllb.py @@ -386,15 +386,11 @@ def prepare_seq2seq_batch( self.tgt_lang = tgt_lang return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) - @contextmanager - def as_target_tokenizer(self): - """ - Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to - sequence-to-sequence models that need a slightly different processing for the labels. - """ - self.set_tgt_lang_special_tokens(self.tgt_lang) - yield - self.set_src_lang_special_tokens(self.src_lang) + def _switch_to_input_mode(self): + return self.set_src_lang_special_tokens(self.src_lang) + + def _switch_to_target_mode(self): + return self.set_tgt_lang_special_tokens(self.tgt_lang) def set_src_lang_special_tokens(self, src_lang) -> None: """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py index fa4eaa4c5a80..0785bb4bcf05 100644 --- a/src/transformers/models/nllb/tokenization_nllb_fast.py +++ b/src/transformers/models/nllb/tokenization_nllb_fast.py @@ -284,15 +284,11 @@ def prepare_seq2seq_batch( self.tgt_lang = tgt_lang return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) - @contextmanager - def as_target_tokenizer(self): - """ - Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to - sequence-to-sequence models that need a slightly different processing for the labels. 
- """ - self.set_tgt_lang_special_tokens(self.tgt_lang) - yield - self.set_src_lang_special_tokens(self.src_lang) + def _switch_to_input_mode(self): + return self.set_src_lang_special_tokens(self.src_lang) + + def _switch_to_target_mode(self): + return self.set_tgt_lang_special_tokens(self.tgt_lang) def set_src_lang_special_tokens(self, src_lang) -> None: """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" diff --git a/src/transformers/models/plbart/tokenization_plbart.py b/src/transformers/models/plbart/tokenization_plbart.py index 4a3ee1cdcd11..f14ee506e6ff 100644 --- a/src/transformers/models/plbart/tokenization_plbart.py +++ b/src/transformers/models/plbart/tokenization_plbart.py @@ -441,15 +441,11 @@ def prepare_seq2seq_batch( self.tgt_lang = tgt_lang return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) - @contextmanager - def as_target_tokenizer(self): - """ - Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to - sequence-to-sequence models that need a slightly different processing for the labels. - """ - self.set_tgt_lang_special_tokens(self.tgt_lang) - yield - self.set_src_lang_special_tokens(self.src_lang) + def _switch_to_input_mode(self): + return self.set_src_lang_special_tokens(self.src_lang) + + def _switch_to_target_mode(self): + return self.set_tgt_lang_special_tokens(self.tgt_lang) def set_src_lang_special_tokens(self, src_lang) -> None: """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" diff --git a/src/transformers/models/rag/tokenization_rag.py b/src/transformers/models/rag/tokenization_rag.py index d92ca1788faa..a1af9af6168b 100644 --- a/src/transformers/models/rag/tokenization_rag.py +++ b/src/transformers/models/rag/tokenization_rag.py @@ -68,16 +68,12 @@ def batch_decode(self, *args, **kwargs): def decode(self, *args, **kwargs): return self.generator.decode(*args, **kwargs) - @contextmanager - def as_target_tokenizer(self): - """ - Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to - sequence-to-sequence models that need a slightly different processing for the labels. - """ - self.current_tokenizer = self.generator - yield + def _switch_to_input_mode(self): self.current_tokenizer = self.question_encoder + def _switch_to_target_mode(self): + self.current_tokenizer = self.generator + def prepare_seq2seq_batch( self, src_texts: List[str], diff --git a/src/transformers/models/tapex/tokenization_tapex.py b/src/transformers/models/tapex/tokenization_tapex.py index ea1dc0dcc492..5bdc307ef665 100644 --- a/src/transformers/models/tapex/tokenization_tapex.py +++ b/src/transformers/models/tapex/tokenization_tapex.py @@ -1330,17 +1330,12 @@ def _target_encode_plus( verbose=verbose, ) - @contextmanager - def as_target_tokenizer(self): - """ - Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to - sequence-to-sequence models that need a slightly different processing for the labels. 
- """ - self.current_tokenizer = TokenizerStrategy.TOKENIZE_TARGET - yield - # restore the call function + def _switch_to_input_mode(self): self.current_tokenizer = TokenizerStrategy.TOKENIZE_SOURCE + def _switch_to_target_mode(self): + self.current_tokenizer = TokenizerStrategy.TOKENIZE_TARGET + def prepare_table_query( self, table, diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 776c9a69db4c..31c0d3ddb0d8 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2431,8 +2431,12 @@ def _get_padding_truncation_strategies( @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) def __call__( self, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + text_pair_target: Optional[ + Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] + ] = None, add_special_tokens: bool = True, padding: Union[bool, str, PaddingStrategy] = False, truncation: Union[bool, str, TruncationStrategy] = False, @@ -2455,15 +2459,82 @@ def __call__( sequences. Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`str`, `List[str]`, `List[List[str]]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - text_pair (`str`, `List[str]`, `List[List[str]]`): + text_pair (`str`, `List[str]`, `List[List[str]]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - """ + text_target (`str`, `List[str]`, `List[List[str]]`, *optional*): + The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a + list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), + you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + text_pair_target (`str`, `List[str]`, `List[List[str]]`, *optional*): + The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a + list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), + you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). 
+ """ + # To avoid duplicating + all_kwargs = dict( + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + ) + all_kwargs.update(kwargs) + if text is None and text_target is None: + raise ValueError("You need to specify either `text` or `text_target`.") + if text is not None: + self._switch_to_input_mode() + encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs) + if text_target is not None: + self._switch_to_target_mode() + target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **all_kwargs) + # Leave back tokenizer in input mode + self._switch_to_input_mode() + + if text_target is None: + return encodings + elif text is None: + return target_encodings + else: + encodings["labels"] = target_encodings["input_ids"] + return encodings + + def _call_one( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -3456,13 +3527,32 @@ def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Opt ) self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True + def _switch_to_input_mode(self): + """ + Private method to put the tokenizer in input mode (when it has different modes for input/outputs) + """ + pass + + def _switch_to_target_mode(self): + """ + Private method to put the tokenizer in target mode (when it has different modes for input/outputs) + """ + pass + @contextmanager def as_target_tokenizer(self): """ Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to sequence-to-sequence models that need a slightly different processing for the labels. """ + warnings.warn( + "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your " + "labels by using the argument `text_target` of the regular `__call__` method (either in the same call as " + "your input texts if you use the same keyword arguments, or in a separate call." 
+ ) + self._switch_to_target_mode() yield + self._switch_to_input_mode() @classmethod def register_for_auto_class(cls, auto_class="AutoTokenizer"): From f529476b492c4a67d3264d6c6cc7b5d9de2f4e6e Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Wed, 27 Jul 2022 14:10:32 -0400 Subject: [PATCH 02/10] Quality + fix tests --- src/transformers/models/m2m_100/tokenization_m2m_100.py | 3 +-- src/transformers/models/marian/tokenization_marian.py | 1 - src/transformers/models/mbart/tokenization_mbart.py | 1 - src/transformers/models/mbart/tokenization_mbart_fast.py | 1 - src/transformers/models/mbart50/tokenization_mbart50.py | 1 - .../models/mbart50/tokenization_mbart50_fast.py | 1 - src/transformers/models/nllb/tokenization_nllb.py | 1 - src/transformers/models/nllb/tokenization_nllb_fast.py | 1 - src/transformers/models/plbart/tokenization_plbart.py | 1 - src/transformers/models/rag/tokenization_rag.py | 1 - src/transformers/models/tapex/tokenization_tapex.py | 1 - src/transformers/tokenization_utils_base.py | 8 ++++++-- 12 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py index 45beeb7f7f3e..c11c8782f227 100644 --- a/src/transformers/models/m2m_100/tokenization_m2m_100.py +++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py @@ -14,7 +14,6 @@ """Tokenization classes for M2M100.""" import json import os -from contextlib import contextmanager from pathlib import Path from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple, Union @@ -349,7 +348,7 @@ def _build_translation_inputs(self, raw_inputs, src_lang: Optional[str], tgt_lan def _switch_to_input_mode(self): self.set_src_lang_special_tokens(self.src_lang) - def _switch_to_targett_mode(self): + def _switch_to_target_mode(self): self.set_tgt_lang_special_tokens(self.tgt_lang) def set_src_lang_special_tokens(self, src_lang: str) -> None: diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index da9cb66f8603..6967a675813c 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -15,7 +15,6 @@ import os import re import warnings -from contextlib import contextmanager from pathlib import Path from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple, Union diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py index 7621cc6bde18..431908b0e97c 100644 --- a/src/transformers/models/mbart/tokenization_mbart.py +++ b/src/transformers/models/mbart/tokenization_mbart.py @@ -14,7 +14,6 @@ # limitations under the License. import os -from contextlib import contextmanager from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py index 6bd45cfcc02d..85937a5fd8ed 100644 --- a/src/transformers/models/mbart/tokenization_mbart_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart_fast.py @@ -14,7 +14,6 @@ # limitations under the License. 
import os -from contextlib import contextmanager from shutil import copyfile from typing import List, Optional, Tuple diff --git a/src/transformers/models/mbart50/tokenization_mbart50.py b/src/transformers/models/mbart50/tokenization_mbart50.py index 7c1777349c1c..274b7f95f829 100644 --- a/src/transformers/models/mbart50/tokenization_mbart50.py +++ b/src/transformers/models/mbart50/tokenization_mbart50.py @@ -14,7 +14,6 @@ # limitations under the License. import os -from contextlib import contextmanager from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple diff --git a/src/transformers/models/mbart50/tokenization_mbart50_fast.py b/src/transformers/models/mbart50/tokenization_mbart50_fast.py index 09fe1b94cbe1..b0f41a0fd719 100644 --- a/src/transformers/models/mbart50/tokenization_mbart50_fast.py +++ b/src/transformers/models/mbart50/tokenization_mbart50_fast.py @@ -14,7 +14,6 @@ # limitations under the License. import os -from contextlib import contextmanager from shutil import copyfile from typing import List, Optional, Tuple diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py index 24d4d21eb5c9..d0c0ce8b74ec 100644 --- a/src/transformers/models/nllb/tokenization_nllb.py +++ b/src/transformers/models/nllb/tokenization_nllb.py @@ -14,7 +14,6 @@ # limitations under the License. import os -from contextlib import contextmanager from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py index 0785bb4bcf05..404b7c093e05 100644 --- a/src/transformers/models/nllb/tokenization_nllb_fast.py +++ b/src/transformers/models/nllb/tokenization_nllb_fast.py @@ -14,7 +14,6 @@ # limitations under the License. import os -from contextlib import contextmanager from shutil import copyfile from typing import List, Optional, Tuple diff --git a/src/transformers/models/plbart/tokenization_plbart.py b/src/transformers/models/plbart/tokenization_plbart.py index f14ee506e6ff..93e109473bdf 100644 --- a/src/transformers/models/plbart/tokenization_plbart.py +++ b/src/transformers/models/plbart/tokenization_plbart.py @@ -14,7 +14,6 @@ # limitations under the License. 
import os -from contextlib import contextmanager from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple diff --git a/src/transformers/models/rag/tokenization_rag.py b/src/transformers/models/rag/tokenization_rag.py index a1af9af6168b..485c2c448373 100644 --- a/src/transformers/models/rag/tokenization_rag.py +++ b/src/transformers/models/rag/tokenization_rag.py @@ -15,7 +15,6 @@ """Tokenization classes for RAG.""" import os import warnings -from contextlib import contextmanager from typing import List, Optional from ...tokenization_utils_base import BatchEncoding diff --git a/src/transformers/models/tapex/tokenization_tapex.py b/src/transformers/models/tapex/tokenization_tapex.py index 5bdc307ef665..d9afd160d38d 100644 --- a/src/transformers/models/tapex/tokenization_tapex.py +++ b/src/transformers/models/tapex/tokenization_tapex.py @@ -17,7 +17,6 @@ import json import os import random -from contextlib import contextmanager from functools import lru_cache from typing import Dict, List, Optional, Tuple, Union diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 31c0d3ddb0d8..3b10c0a3d847 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1501,7 +1501,7 @@ def __init__(self, **kwargs): self.deprecation_warnings = ( {} ) # Use to store when we have already noticed a deprecation warning (avoid overlogging). - + self._in_target_context_manager = False super().__init__(**kwargs) @property @@ -2498,7 +2498,9 @@ def __call__( if text is None and text_target is None: raise ValueError("You need to specify either `text` or `text_target`.") if text is not None: - self._switch_to_input_mode() + # The context manager will send the inputs as normal texts and not text_target. + if not self._in_target_context_manager: + self._switch_to_input_mode() encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs) if text_target is not None: self._switch_to_target_mode() @@ -3551,7 +3553,9 @@ def as_target_tokenizer(self): "your input texts if you use the same keyword arguments, or in a separate call." 
) self._switch_to_target_mode() + self._in_target_context_manager = True yield + self._in_target_context_manager = False self._switch_to_input_mode() @classmethod From 8b2af7275da1de3b7c84385bbff048c5ec43e324 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Wed, 27 Jul 2022 14:38:28 -0400 Subject: [PATCH 03/10] Treat processors --- .../models/mctct/processing_mctct.py | 62 +++++++++++++++++- .../processing_speech_to_text.py | 39 ++++++++++- .../processing_speech_to_text_2.py | 39 ++++++++++- .../models/trocr/processing_trocr.py | 39 ++++++++++- .../models/wav2vec2/processing_wav2vec2.py | 61 +++++++++++++++++- .../processing_wav2vec2_with_lm.py | 64 ++++++++++++++++++- src/transformers/tokenization_utils_base.py | 3 +- 7 files changed, 296 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/mctct/processing_mctct.py b/src/transformers/models/mctct/processing_mctct.py index 0892f345928b..8cea9ce2a45e 100644 --- a/src/transformers/models/mctct/processing_mctct.py +++ b/src/transformers/models/mctct/processing_mctct.py @@ -15,6 +15,7 @@ """ Speech processor class for M-CTC-T """ +import warnings from contextlib import contextmanager from ...processing_utils import ProcessorMixin @@ -39,6 +40,7 @@ class MCTCTProcessor(ProcessorMixin): def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor + self._in_target_context_manager = False def __call__(self, *args, **kwargs): """ @@ -47,7 +49,35 @@ def __call__(self, *args, **kwargs): [`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to AutoTokenizer's [`~AutoTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information. """ - return self.current_processor(*args, **kwargs) + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor(*args, **kwargs) + + if "raw_speech" in kwargs: + warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.") + audio = kwargs.pop("raw_speech") + else: + audio = kwargs.pop("audio", None) + text = kwargs.pop("text", None) + if len(args) > 0: + audio = args[0] + args = args[1:] + + if audio is None and text is None: + raise ValueError("You need to specify either an `audio` or `text` input to process.") + + if audio is not None: + inputs = self.feature_extractor(audio, *args, **kwargs) + if text is not None: + encodings = self.tokenizer(text, **kwargs) + + if text is None: + return inputs + elif audio is None: + return encodings + else: + inputs["labels"] = encodings["input_ids"] + return inputs def batch_decode(self, *args, **kwargs): """ @@ -63,7 +93,28 @@ def pad(self, *args, **kwargs): [`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information. 
""" - return self.current_processor.pad(*args, **kwargs) + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor.pad(*args, **kwargs) + + input_features = kwargs.pop("input_features", None) + labels = kwargs.pop("labels", None) + if len(args) > 1: + input_features = args[0] + args = args[1:] + + if input_features is not None: + input_features = self.feature_extractor.pad(input_features, *args, **kwargs) + if labels is not None: + labels = self.tokenizer.pad(labels, **kwargs) + + if labels is None: + return input_features + elif input_features is None: + return labels + else: + input_features["labels"] = labels["input_ids"] + return input_features def decode(self, *args, **kwargs): """ @@ -77,6 +128,13 @@ def as_target_processor(self): """ Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning MCTCT. """ + warnings.warn( + "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your " + "labels by using the argument `text` of the regular `__call__` method (either in the same call as " + "your audio inputs, or in a separate call." + ) + self._in_target_context_manager = True self.current_processor = self.tokenizer yield self.current_processor = self.feature_extractor + self._in_target_context_manager = False diff --git a/src/transformers/models/speech_to_text/processing_speech_to_text.py b/src/transformers/models/speech_to_text/processing_speech_to_text.py index 969df9d108fe..3f047932030f 100644 --- a/src/transformers/models/speech_to_text/processing_speech_to_text.py +++ b/src/transformers/models/speech_to_text/processing_speech_to_text.py @@ -15,6 +15,7 @@ """ Speech processor class for Speech2Text """ +import warnings from contextlib import contextmanager from ...processing_utils import ProcessorMixin @@ -41,6 +42,7 @@ class Speech2TextProcessor(ProcessorMixin): def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor + self._in_target_context_manager = False def __call__(self, *args, **kwargs): """ @@ -50,7 +52,35 @@ def __call__(self, *args, **kwargs): [`~Speech2TextTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information. """ - return self.current_processor(*args, **kwargs) + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor(*args, **kwargs) + + if "raw_speech" in kwargs: + warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.") + audio = kwargs.pop("raw_speech") + else: + audio = kwargs.pop("audio", None) + text = kwargs.pop("text", None) + if len(args) > 0: + audio = args[0] + args = args[1:] + + if audio is None and text is None: + raise ValueError("You need to specify either an `audio` or `text` input to process.") + + if audio is not None: + inputs = self.feature_extractor(audio, *args, **kwargs) + if text is not None: + encodings = self.tokenizer(text, **kwargs) + + if text is None: + return inputs + elif audio is None: + return encodings + else: + inputs["labels"] = encodings["input_ids"] + return inputs def batch_decode(self, *args, **kwargs): """ @@ -72,6 +102,13 @@ def as_target_processor(self): Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning Speech2Text. """ + warnings.warn( + "`as_target_processor` is deprecated and will be removed in v5 of Transformers. 
You can process your " + "labels by using the argument `text` of the regular `__call__` method (either in the same call as " + "your audio inputs, or in a separate call." + ) + self._in_target_context_manager = True self.current_processor = self.tokenizer yield self.current_processor = self.feature_extractor + self._in_target_context_manager = False diff --git a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py index 28189ba88198..c40831d0214a 100644 --- a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py @@ -15,6 +15,7 @@ """ Speech processor class for Speech2Text2 """ +import warnings from contextlib import contextmanager from ...processing_utils import ProcessorMixin @@ -40,6 +41,7 @@ class Speech2Text2Processor(ProcessorMixin): def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor + self._in_target_context_manager = False def __call__(self, *args, **kwargs): """ @@ -49,7 +51,35 @@ def __call__(self, *args, **kwargs): Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information. """ - return self.current_processor(*args, **kwargs) + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor(*args, **kwargs) + + if "raw_speech" in kwargs: + warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.") + audio = kwargs.pop("raw_speech") + else: + audio = kwargs.pop("audio", None) + text = kwargs.pop("text", None) + if len(args) > 0: + audio = args[0] + args = args[1:] + + if audio is None and text is None: + raise ValueError("You need to specify either an `audio` or `text` input to process.") + + if audio is not None: + inputs = self.feature_extractor(audio, *args, **kwargs) + if text is not None: + encodings = self.tokenizer(text, **kwargs) + + if text is None: + return inputs + elif audio is None: + return encodings + else: + inputs["labels"] = encodings["input_ids"] + return inputs def batch_decode(self, *args, **kwargs): """ @@ -71,6 +101,13 @@ def as_target_processor(self): Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning Speech2Text2. """ + warnings.warn( + "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your " + "labels by using the argument `text` of the regular `__call__` method (either in the same call as " + "your audio inputs, or in a separate call." + ) + self._in_target_context_manager = True self.current_processor = self.tokenizer yield self.current_processor = self.feature_extractor + self._in_target_context_manager = False diff --git a/src/transformers/models/trocr/processing_trocr.py b/src/transformers/models/trocr/processing_trocr.py index 2c7893a0915b..44a276fd63ae 100644 --- a/src/transformers/models/trocr/processing_trocr.py +++ b/src/transformers/models/trocr/processing_trocr.py @@ -15,6 +15,7 @@ """ Processor class for TrOCR. 
""" +import warnings from contextlib import contextmanager from ...processing_utils import ProcessorMixin @@ -40,6 +41,7 @@ class TrOCRProcessor(ProcessorMixin): def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor + self._in_target_context_manager = False def __call__(self, *args, **kwargs): """ @@ -48,7 +50,35 @@ def __call__(self, *args, **kwargs): [`~TrOCRProcessor.as_target_processor`] this method forwards all its arguments to TrOCRTokenizer's [`~TrOCRTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information. """ - return self.current_processor(*args, **kwargs) + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor(*args, **kwargs) + + if "raw_speech" in kwargs: + warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.") + audio = kwargs.pop("raw_speech") + else: + audio = kwargs.pop("audio", None) + text = kwargs.pop("text", None) + if len(args) > 0: + audio = args[0] + args = args[1:] + + if audio is None and text is None: + raise ValueError("You need to specify either an `audio` or `text` input to process.") + + if audio is not None: + inputs = self.feature_extractor(audio, *args, **kwargs) + if text is not None: + encodings = self.tokenizer(text, **kwargs) + + if text is None: + return inputs + elif audio is None: + return encodings + else: + inputs["labels"] = encodings["input_ids"] + return inputs def batch_decode(self, *args, **kwargs): """ @@ -69,6 +99,13 @@ def as_target_processor(self): """ Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning TrOCR. """ + warnings.warn( + "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your " + "labels by using the argument `text` of the regular `__call__` method (either in the same call as " + "your audio inputs, or in a separate call." + ) + self._in_target_context_manager = True self.current_processor = self.tokenizer yield self.current_processor = self.feature_extractor + self._in_target_context_manager = False diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py index 1470c254dc63..55af0f1f6ccd 100644 --- a/src/transformers/models/wav2vec2/processing_wav2vec2.py +++ b/src/transformers/models/wav2vec2/processing_wav2vec2.py @@ -43,6 +43,7 @@ class Wav2Vec2Processor(ProcessorMixin): def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor + self._in_target_context_manager = False @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): @@ -70,7 +71,35 @@ def __call__(self, *args, **kwargs): [`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information. """ - return self.current_processor(*args, **kwargs) + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor(*args, **kwargs) + + if "raw_speech" in kwargs: + warnings.warn("Using `raw_speech` as a keyword argument is deprecated. 
Use `audio` instead.") + audio = kwargs.pop("raw_speech") + else: + audio = kwargs.pop("audio", None) + text = kwargs.pop("text", None) + if len(args) > 0: + audio = args[0] + args = args[1:] + + if audio is None and text is None: + raise ValueError("You need to specify either an `audio` or `text` input to process.") + + if audio is not None: + inputs = self.feature_extractor(audio, *args, **kwargs) + if text is not None: + encodings = self.tokenizer(text, **kwargs) + + if text is None: + return inputs + elif audio is None: + return encodings + else: + inputs["labels"] = encodings["input_ids"] + return inputs def pad(self, *args, **kwargs): """ @@ -79,7 +108,28 @@ def pad(self, *args, **kwargs): [`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information. """ - return self.current_processor.pad(*args, **kwargs) + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor.pad(*args, **kwargs) + + input_features = kwargs.pop("input_features", None) + labels = kwargs.pop("labels", None) + if len(args) > 1: + input_features = args[0] + args = args[1:] + + if input_features is not None: + input_features = self.feature_extractor.pad(input_features, *args, **kwargs) + if labels is not None: + labels = self.tokenizer.pad(labels, **kwargs) + + if labels is None: + return input_features + elif input_features is None: + return labels + else: + input_features["labels"] = labels["input_ids"] + return input_features def batch_decode(self, *args, **kwargs): """ @@ -101,6 +151,13 @@ def as_target_processor(self): Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning Wav2Vec2. """ + warnings.warn( + "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your " + "labels by using the argument `text` of the regular `__call__` method (either in the same call as " + "your audio inputs, or in a separate call." + ) + self._in_target_context_manager = True self.current_processor = self.tokenizer yield self.current_processor = self.feature_extractor + self._in_target_context_manager = False diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py index 4e7da075261b..534ea566ccc5 100644 --- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py @@ -16,6 +16,7 @@ Speech processor class for Wav2Vec2 """ import os +import warnings from contextlib import contextmanager from dataclasses import dataclass from multiprocessing import get_context @@ -99,6 +100,7 @@ def __init__( self.decoder = decoder self.current_processor = self.feature_extractor + self._in_target_context_manager = False def save_pretrained(self, save_directory): super().save_pretrained(save_directory) @@ -214,7 +216,35 @@ def __call__(self, *args, **kwargs): Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information. """ - return self.current_processor(*args, **kwargs) + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor(*args, **kwargs) + + if "raw_speech" in kwargs: + warnings.warn("Using `raw_speech` as a keyword argument is deprecated. 
Use `audio` instead.") + audio = kwargs.pop("raw_speech") + else: + audio = kwargs.pop("audio", None) + text = kwargs.pop("text", None) + if len(args) > 0: + audio = args[0] + args = args[1:] + + if audio is None and text is None: + raise ValueError("You need to specify either an `audio` or `text` input to process.") + + if audio is not None: + inputs = self.feature_extractor(audio, *args, **kwargs) + if text is not None: + encodings = self.tokenizer(text, **kwargs) + + if text is None: + return inputs + elif audio is None: + return encodings + else: + inputs["labels"] = encodings["input_ids"] + return inputs def pad(self, *args, **kwargs): """ @@ -224,7 +254,28 @@ def pad(self, *args, **kwargs): Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.pad`]. Please refer to the docstring of the above two methods for more information. """ - return self.current_processor.pad(*args, **kwargs) + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor.pad(*args, **kwargs) + + input_features = kwargs.pop("input_features", None) + labels = kwargs.pop("labels", None) + if len(args) > 1: + input_features = args[0] + args = args[1:] + + if input_features is not None: + input_features = self.feature_extractor.pad(input_features, *args, **kwargs) + if labels is not None: + labels = self.tokenizer.pad(labels, **kwargs) + + if labels is None: + return input_features + elif input_features is None: + return labels + else: + input_features["labels"] = labels["input_ids"] + return input_features def batch_decode( self, @@ -486,9 +537,16 @@ def decode( @contextmanager def as_target_processor(self): """ - Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning + Temporarily sets the processor for processing the target. Useful for encoding the labels when fine-tuning Wav2Vec2. """ + warnings.warn( + "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your " + "labels by using the argument `text` of the regular `__call__` method (either in the same call as " + "your audio inputs, or in a separate call." + ) + self._in_target_context_manager = True self.current_processor = self.tokenizer yield self.current_processor = self.feature_extractor + self._in_target_context_manager = False diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 3b10c0a3d847..b8ab1e09c445 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2498,7 +2498,8 @@ def __call__( if text is None and text_target is None: raise ValueError("You need to specify either `text` or `text_target`.") if text is not None: - # The context manager will send the inputs as normal texts and not text_target. + # The context manager will send the inputs as normal texts and not text_target, but we shouldn't change the + # input mode in this case. 
if not self._in_target_context_manager: self._switch_to_input_mode() encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs) From 5bfa57bf9152e5d8f48dbf2726f438464ec3222b Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Wed, 27 Jul 2022 15:33:30 -0400 Subject: [PATCH 04/10] Fix pad --- src/transformers/models/mctct/processing_mctct.py | 2 +- src/transformers/models/wav2vec2/processing_wav2vec2.py | 2 +- .../models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/mctct/processing_mctct.py b/src/transformers/models/mctct/processing_mctct.py index 8cea9ce2a45e..2e05020196ac 100644 --- a/src/transformers/models/mctct/processing_mctct.py +++ b/src/transformers/models/mctct/processing_mctct.py @@ -99,7 +99,7 @@ def pad(self, *args, **kwargs): input_features = kwargs.pop("input_features", None) labels = kwargs.pop("labels", None) - if len(args) > 1: + if len(args) > 0: input_features = args[0] args = args[1:] diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py index 55af0f1f6ccd..5763d4d59eea 100644 --- a/src/transformers/models/wav2vec2/processing_wav2vec2.py +++ b/src/transformers/models/wav2vec2/processing_wav2vec2.py @@ -114,7 +114,7 @@ def pad(self, *args, **kwargs): input_features = kwargs.pop("input_features", None) labels = kwargs.pop("labels", None) - if len(args) > 1: + if len(args) > 0: input_features = args[0] args = args[1:] diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py index 534ea566ccc5..f09b5eb922ab 100644 --- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py @@ -260,7 +260,7 @@ def pad(self, *args, **kwargs): input_features = kwargs.pop("input_features", None) labels = kwargs.pop("labels", None) - if len(args) > 1: + if len(args) > 0: input_features = args[0] args = args[1:] From 7ea9d1d37e0fd9a18eba1db541b1acf580a743f7 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Thu, 28 Jul 2022 11:22:32 -0400 Subject: [PATCH 05/10] Remove all uses of in tests, docs and examples --- docs/source/en/model_doc/mctct.mdx | 1 - .../en/model_doc/speech-encoder-decoder.mdx | 10 +++----- docs/source/en/model_doc/speech_to_text.mdx | 1 - docs/source/en/model_doc/speech_to_text_2.mdx | 1 - docs/source/en/model_doc/trocr.mdx | 1 - docs/source/en/model_doc/wav2vec2.mdx | 2 -- docs/source/en/preprocessing.mdx | 4 +--- docs/source/en/tasks/asr.mdx | 19 ++++----------- docs/source/es/preprocessing.mdx | 4 +--- docs/source/it/preprocessing.mdx | 4 +--- .../run_speech_recognition_ctc.py | 13 +++++----- .../run_speech_recognition_ctc_bnb.py | 13 +++++----- .../run_speech_recognition_ctc_streaming.py | 13 +++++----- .../research_projects/wav2vec2/run_asr.py | 22 ++++++++--------- .../wav2vec2/run_common_voice.py | 24 +++++++++---------- .../xtreme-s/run_xtreme_s.py | 13 +++++----- .../models/hubert/modeling_tf_hubert.py | 5 ++-- .../modeling_speech_encoder_decoder.py | 3 +-- .../models/wav2vec2/modeling_tf_wav2vec2.py | 5 ++-- src/transformers/utils/doc.py | 6 ++--- tests/models/mctct/test_processor_mctct.py | 3 +-- .../test_processor_speech_to_text.py | 3 +-- .../wav2vec2/test_processor_wav2vec2.py | 3 +-- .../test_processor_wav2vec2_with_lm.py | 3 +-- 24 files changed, 69 insertions(+), 107 deletions(-) diff 
--git a/docs/source/en/model_doc/mctct.mdx b/docs/source/en/model_doc/mctct.mdx index f064f1e3d020..531508cfa9df 100644 --- a/docs/source/en/model_doc/mctct.mdx +++ b/docs/source/en/model_doc/mctct.mdx @@ -48,7 +48,6 @@ This model was contributed by [cwkeam](https://huggingface.co/cwkeam). The origi - save_pretrained - batch_decode - decode - - as_target_processor ## MCTCTModel diff --git a/docs/source/en/model_doc/speech-encoder-decoder.mdx b/docs/source/en/model_doc/speech-encoder-decoder.mdx index 9aee71ed6669..7294b55d3c31 100644 --- a/docs/source/en/model_doc/speech-encoder-decoder.mdx +++ b/docs/source/en/model_doc/speech-encoder-decoder.mdx @@ -85,7 +85,7 @@ As you can see, only 2 inputs are required for the model in order to compute a l speech inputs) and `labels` (which are the `input_ids` of the encoded target sequence). ```python ->>> from transformers import Wav2Vec2Processor, SpeechEncoderDecoderModel +>>> from transformers import BertTokenizer, Wav2Vec2Processor, SpeechEncoderDecoderModel >>> from datasets import load_dataset >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") @@ -99,14 +99,10 @@ speech inputs) and `labels` (which are the `input_ids` of the encoded target seq >>> # load a speech input >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ->>> input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values - ->>> # load its corresponding transcription ->>> with processor.as_target_processor(): -... labels = processor(ds[0]["text"], return_tensors="pt").input_ids +>>> input_features = processor(ds[0]["audio"]["array"], text=ds[0]["text"], return_tensors="pt") >>> # the forward function automatically creates the correct decoder_input_ids ->>> loss = model(input_values, labels=labels).loss +>>> loss = model(**input_features).loss >>> loss.backward() ``` diff --git a/docs/source/en/model_doc/speech_to_text.mdx b/docs/source/en/model_doc/speech_to_text.mdx index e11d95442d26..9d855fceb480 100644 --- a/docs/source/en/model_doc/speech_to_text.mdx +++ b/docs/source/en/model_doc/speech_to_text.mdx @@ -120,7 +120,6 @@ See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look - save_pretrained - batch_decode - decode - - as_target_processor ## Speech2TextModel diff --git a/docs/source/en/model_doc/speech_to_text_2.mdx b/docs/source/en/model_doc/speech_to_text_2.mdx index 72754b67aab9..ce9e29c32e82 100644 --- a/docs/source/en/model_doc/speech_to_text_2.mdx +++ b/docs/source/en/model_doc/speech_to_text_2.mdx @@ -114,7 +114,6 @@ See [model hub](https://huggingface.co/models?filter=speech2text2) to look for S - save_pretrained - batch_decode - decode - - as_target_processor ## Speech2Text2ForCausalLM diff --git a/docs/source/en/model_doc/trocr.mdx b/docs/source/en/model_doc/trocr.mdx index 08de107e434c..37dc6f545595 100644 --- a/docs/source/en/model_doc/trocr.mdx +++ b/docs/source/en/model_doc/trocr.mdx @@ -94,7 +94,6 @@ See the [model hub](https://huggingface.co/models?filter=trocr) to look for TrOC - save_pretrained - batch_decode - decode - - as_target_processor ## TrOCRForCausalLM diff --git a/docs/source/en/model_doc/wav2vec2.mdx b/docs/source/en/model_doc/wav2vec2.mdx index 9b2f13ea4541..eaca36be4673 100644 --- a/docs/source/en/model_doc/wav2vec2.mdx +++ b/docs/source/en/model_doc/wav2vec2.mdx @@ -62,7 +62,6 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv - save_pretrained - batch_decode - decode - - as_target_processor 
## Wav2Vec2ProcessorWithLM @@ -73,7 +72,6 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv - save_pretrained - batch_decode - decode - - as_target_processor ## Wav2Vec2 specific outputs diff --git a/docs/source/en/preprocessing.mdx b/docs/source/en/preprocessing.mdx index f9bdae3603af..e67741633acb 100644 --- a/docs/source/en/preprocessing.mdx +++ b/docs/source/en/preprocessing.mdx @@ -486,10 +486,8 @@ A processor combines a feature extractor and tokenizer. Load a processor with [` >>> def prepare_dataset(example): ... audio = example["audio"] -... example["input_values"] = processor(audio["array"], sampling_rate=16000) +... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) -... with processor.as_target_processor(): -... example["labels"] = processor(example["text"]).input_ids ... return example ``` diff --git a/docs/source/en/tasks/asr.mdx b/docs/source/en/tasks/asr.mdx index 8ceea824f4ee..daa627aaf131 100644 --- a/docs/source/en/tasks/asr.mdx +++ b/docs/source/en/tasks/asr.mdx @@ -109,11 +109,10 @@ The preprocessing function needs to: >>> def prepare_dataset(batch): ... audio = batch["audio"] -... batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0] +... batch = processor(audio=audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0] ... batch["input_length"] = len(batch["input_values"]) -... with processor.as_target_processor(): -... batch["labels"] = processor(batch["transcription"]).input_ids +... batch["labels"] = processor(text=batch["transcription"]).input_ids ... return batch ``` @@ -146,17 +145,9 @@ Unlike other data collators, this specific data collator needs to apply a differ ... input_features = [{"input_values": feature["input_values"]} for feature in features] ... label_features = [{"input_ids": feature["labels"]} for feature in features] -... batch = self.processor.pad( -... input_features, -... padding=self.padding, -... return_tensors="pt", -... ) -... with self.processor.as_target_processor(): -... labels_batch = self.processor.pad( -... label_features, -... padding=self.padding, -... return_tensors="pt", -... ) +... batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt") + +... labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt") ... # replace padding with -100 to ignore loss correctly ... labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) diff --git a/docs/source/es/preprocessing.mdx b/docs/source/es/preprocessing.mdx index 3e749ca2cde8..9608bf58d9d6 100644 --- a/docs/source/es/preprocessing.mdx +++ b/docs/source/es/preprocessing.mdx @@ -471,10 +471,8 @@ Un processor combina un extractor de características y un tokenizador. Cargue u >>> def prepare_dataset(example): ... audio = example["audio"] -... example["input_values"] = processor(audio["array"], sampling_rate=16000) +... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) -... with processor.as_target_processor(): -... example["labels"] = processor(example["text"]).input_ids ... return example ``` diff --git a/docs/source/it/preprocessing.mdx b/docs/source/it/preprocessing.mdx index 5a245fe84302..a57ff9df9151 100644 --- a/docs/source/it/preprocessing.mdx +++ b/docs/source/it/preprocessing.mdx @@ -471,10 +471,8 @@ Un processor combina un estrattore di caratteristiche e un tokenizer. 
Carica un >>> def prepare_dataset(example): ... audio = example["audio"] -... example["input_values"] = processor(audio["array"], sampling_rate=16000) +... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) -... with processor.as_target_processor(): -... example["labels"] = processor(example["text"]).input_ids ... return example ``` diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 25e817637779..140e1bb64b04 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -305,13 +305,12 @@ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> return_tensors="pt", ) - with self.processor.as_target_processor(): - labels_batch = self.processor.pad( - label_features, - padding=self.padding, - pad_to_multiple_of=self.pad_to_multiple_of_labels, - return_tensors="pt", - ) + labels_batch = self.processor.pad( + labels=label_features, + padding=self.padding, + pad_to_multiple_of=self.pad_to_multiple_of_labels, + return_tensors="pt", + ) # replace padding with -100 to ignore loss correctly labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py index 521036c78e4b..afa3397eb430 100755 --- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py +++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py @@ -304,13 +304,12 @@ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> return_tensors="pt", ) - with self.processor.as_target_processor(): - labels_batch = self.processor.pad( - label_features, - padding=self.padding, - pad_to_multiple_of=self.pad_to_multiple_of_labels, - return_tensors="pt", - ) + labels_batch = self.processor.pad( + labels=label_features, + padding=self.padding, + pad_to_multiple_of=self.pad_to_multiple_of_labels, + return_tensors="pt", + ) # replace padding with -100 to ignore loss correctly labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py index d357bc469649..57f54048a523 100644 --- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py +++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py @@ -301,13 +301,12 @@ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> return_tensors="pt", ) - with self.processor.as_target_processor(): - labels_batch = self.processor.pad( - label_features, - padding=self.padding, - pad_to_multiple_of=self.pad_to_multiple_of_labels, - return_tensors="pt", - ) + labels_batch = self.processor.pad( + labels=label_features, + padding=self.padding, + pad_to_multiple_of=self.pad_to_multiple_of_labels, + return_tensors="pt", + ) # replace padding with -100 to ignore loss correctly labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) diff --git a/examples/research_projects/wav2vec2/run_asr.py b/examples/research_projects/wav2vec2/run_asr.py index 
bb34e0a0c71a..ab9db11d2a02 100755 --- a/examples/research_projects/wav2vec2/run_asr.py +++ b/examples/research_projects/wav2vec2/run_asr.py @@ -266,14 +266,13 @@ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> pad_to_multiple_of=self.pad_to_multiple_of, return_tensors="pt", ) - with self.processor.as_target_processor(): - labels_batch = self.processor.pad( - label_features, - padding=self.padding, - max_length=self.max_length_labels, - pad_to_multiple_of=self.pad_to_multiple_of_labels, - return_tensors="pt", - ) + labels_batch = self.processor.pad( + labels=label_features, + padding=self.padding, + max_length=self.max_length_labels, + pad_to_multiple_of=self.pad_to_multiple_of_labels, + return_tensors="pt", + ) # replace padding with -100 to ignore loss correctly labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) @@ -419,9 +418,10 @@ def prepare_dataset(batch): len(set(batch["sampling_rate"])) == 1 ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}." - batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values - with processor.as_target_processor(): - batch["labels"] = processor(batch[data_args.target_text_column]).input_ids + processed_batch = processor( + audio=batch["speech"], text=batch[data_args.target_text_column], sampling_rate=batch["sampling_rate"][0] + ) + batch.update(processed_batch) return batch train_dataset = train_dataset.map( diff --git a/examples/research_projects/wav2vec2/run_common_voice.py b/examples/research_projects/wav2vec2/run_common_voice.py index b8480d3c7d1c..10a3a77fa758 100644 --- a/examples/research_projects/wav2vec2/run_common_voice.py +++ b/examples/research_projects/wav2vec2/run_common_voice.py @@ -185,14 +185,13 @@ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> pad_to_multiple_of=self.pad_to_multiple_of, return_tensors="pt", ) - with self.processor.as_target_processor(): - labels_batch = self.processor.pad( - label_features, - padding=self.padding, - max_length=self.max_length_labels, - pad_to_multiple_of=self.pad_to_multiple_of_labels, - return_tensors="pt", - ) + labels_batch = self.processor.pad( + labels=label_features, + padding=self.padding, + max_length=self.max_length_labels, + pad_to_multiple_of=self.pad_to_multiple_of_labels, + return_tensors="pt", + ) # replace padding with -100 to ignore loss correctly labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) @@ -414,10 +413,11 @@ def prepare_dataset(batch): assert ( len(set(batch["sampling_rate"])) == 1 ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}." 
- batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values - # Setup the processor for targets - with processor.as_target_processor(): - batch["labels"] = processor(batch["target_text"]).input_ids + + processed_batch = processor( + audio=batch["speech"], text=batch["target_text"], sampling_rate=batch["sampling_rate"][0] + ) + batch.update(processed_batch) return batch train_dataset = train_dataset.map( diff --git a/examples/research_projects/xtreme-s/run_xtreme_s.py b/examples/research_projects/xtreme-s/run_xtreme_s.py index 972c6d5462ff..d3e4f5cb38ab 100644 --- a/examples/research_projects/xtreme-s/run_xtreme_s.py +++ b/examples/research_projects/xtreme-s/run_xtreme_s.py @@ -349,13 +349,12 @@ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> if self.pad_labels: label_features = [{"input_ids": feature["labels"]} for feature in features] - with self.processor.as_target_processor(): - labels_batch = self.processor.pad( - label_features, - padding=self.padding, - pad_to_multiple_of=self.pad_to_multiple_of_labels, - return_tensors="pt", - ) + labels_batch = self.processor.pad( + labels=label_features, + padding=self.padding, + pad_to_multiple_of=self.pad_to_multiple_of_labels, + return_tensors="pt", + ) # replace padding with -100 to ignore loss correctly labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index fc6e5b13d408..f078b5d0cfc7 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -1612,9 +1612,8 @@ def call( >>> # compute loss >>> target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST" - >>> # wrap processor as target processor to encode labels - >>> with processor.as_target_processor(): - ... labels = processor(transcription, return_tensors="tf").input_values + >>> # Pass the target transcription as `text` to encode labels + >>> labels = processor(text=target_transcription, return_tensors="tf").input_ids >>> loss = model(input_values, labels=labels).loss ```""" diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index bf67c6d54466..388be2449947 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -482,8 +482,7 @@ def forward( 'Mr. Quilter ist der Apostel der Mittelschicht und wir freuen uns, sein Evangelium willkommen heißen zu können.' >>> # Training: Train model on English transcription - >>> with processor.as_target_processor(): - ...
labels = processor(ds[0]["text"], return_tensors="pt").input_ids + >>> labels = processor(text=ds[0]["text"], return_tensors="pt").input_ids >>> loss = model(input_values, labels=labels).loss >>> loss.backward() diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index fed0414863a5..854831e45a09 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -1650,9 +1650,8 @@ def call( >>> # compute loss >>> target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST" - >>> # wrap processor as target processor to encode labels - >>> with processor.as_target_processor(): - ... labels = processor(transcription, return_tensors="tf").input_ids + >>> # Pass the target transcription as `text` to encode labels + >>> labels = processor(text=target_transcription, return_tensors="tf").input_ids >>> loss = model(input_values, labels=labels).loss ```""" diff --git a/src/transformers/utils/doc.py b/src/transformers/utils/doc.py index 8f0caf825bba..6761dec9c969 100644 --- a/src/transformers/utils/doc.py +++ b/src/transformers/utils/doc.py @@ -428,8 +428,7 @@ def _prepare_output_docstrings(output_type, config_class, min_indent=None): ``` ```python - >>> with processor.as_target_processor(): - ... inputs["labels"] = processor(dataset[0]["text"], return_tensors="pt").input_ids + >>> inputs["labels"] = processor(text=dataset[0]["text"], return_tensors="pt").input_ids >>> # compute loss >>> loss = model(**inputs).loss @@ -849,8 +848,7 @@ def _prepare_output_docstrings(output_type, config_class, min_indent=None): ``` ```python - >>> with processor.as_target_processor(): - ... inputs["labels"] = processor(dataset[0]["text"], return_tensors="tf").input_ids + >>> inputs["labels"] = processor(text=dataset[0]["text"], return_tensors="tf").input_ids >>> # compute loss >>> loss = model(**inputs).loss diff --git a/tests/models/mctct/test_processor_mctct.py b/tests/models/mctct/test_processor_mctct.py index 83201f410215..821e44b48e24 100644 --- a/tests/models/mctct/test_processor_mctct.py +++ b/tests/models/mctct/test_processor_mctct.py @@ -125,8 +125,7 @@ def test_tokenizer(self): input_str = "This is a test string" - with processor.as_target_processor(): - encoded_processor = processor(input_str) + encoded_processor = processor(text=input_str) encoded_tok = tokenizer(input_str) diff --git a/tests/models/speech_to_text/test_processor_speech_to_text.py b/tests/models/speech_to_text/test_processor_speech_to_text.py index e6e43f1bb8d7..d519f005d3eb 100644 --- a/tests/models/speech_to_text/test_processor_speech_to_text.py +++ b/tests/models/speech_to_text/test_processor_speech_to_text.py @@ -125,8 +125,7 @@ def test_tokenizer(self): input_str = "This is a test string" - with processor.as_target_processor(): - encoded_processor = processor(input_str) + encoded_processor = processor(text=input_str) encoded_tok = tokenizer(input_str) diff --git a/tests/models/wav2vec2/test_processor_wav2vec2.py b/tests/models/wav2vec2/test_processor_wav2vec2.py index 8b7188f8ebc0..5f1c259061c4 100644 --- a/tests/models/wav2vec2/test_processor_wav2vec2.py +++ b/tests/models/wav2vec2/test_processor_wav2vec2.py @@ -118,8 +118,7 @@ def test_tokenizer(self): input_str = "This is a test string" - with processor.as_target_processor(): - encoded_processor = processor(input_str) + encoded_processor = processor(text=input_str) encoded_tok = tokenizer(input_str) diff --git
a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py index f5b3eea926d8..d66a5923868d 100644 --- a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py +++ b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py @@ -164,8 +164,7 @@ def test_tokenizer(self): input_str = "This is a test string" - with processor.as_target_processor(): - encoded_processor = processor(input_str) + encoded_processor = processor(text=input_str) encoded_tok = tokenizer(input_str) From 024483cb69322a76b7541713f66eb7fffa20291c Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Thu, 28 Jul 2022 12:33:15 -0400 Subject: [PATCH 06/10] Replace all as_target_tokenizer --- docs/source/en/model_doc/m2m_100.mdx | 4 +- docs/source/en/model_doc/marian.mdx | 1 - docs/source/en/model_doc/mbart.mdx | 17 +++------ docs/source/en/model_doc/nllb.mdx | 1 - docs/source/en/model_doc/plbart.mdx | 12 ++---- docs/source/en/tasks/summarization.mdx | 5 +-- docs/source/en/tasks/translation.mdx | 7 +--- .../run_image_captioning_flax.py | 9 ++--- .../summarization/run_summarization_flax.py | 11 ++++-- .../question-answering/run_seq2seq_qa.py | 10 ++--- .../summarization/run_summarization.py | 5 +-- .../run_summarization_no_trainer.py | 5 +-- .../pytorch/translation/run_translation.py | 5 +-- .../translation/run_translation_no_trainer.py | 5 +-- .../tapex/run_wikisql_with_tapex.py | 13 +++---- .../run_wikitablequestions_with_tapex.py | 13 +++---- .../summarization/run_summarization.py | 5 +-- .../tensorflow/translation/run_translation.py | 5 +-- .../models/m2m_100/tokenization_m2m_100.py | 6 +-- .../models/marian/tokenization_marian.py | 5 +-- .../models/mbart/tokenization_mbart.py | 5 +-- .../models/mbart/tokenization_mbart_fast.py | 5 +-- .../models/mbart50/tokenization_mbart50.py | 6 +-- .../mbart50/tokenization_mbart50_fast.py | 6 +-- .../models/mt5/modeling_flax_mt5.py | 9 ++--- src/transformers/models/mt5/modeling_mt5.py | 9 ++--- .../models/mt5/modeling_tf_mt5.py | 9 ++--- .../models/nllb/tokenization_nllb.py | 5 +-- .../models/nllb/tokenization_nllb_fast.py | 5 +-- .../models/plbart/tokenization_plbart.py | 5 +-- src/transformers/models/rag/modeling_rag.py | 6 +-- .../models/rag/tokenization_rag.py | 23 ++++++------ src/transformers/tokenization_utils_base.py | 30 ++++++++------- tests/models/bart/test_tokenization_bart.py | 8 ++-- tests/models/byt5/test_tokenization_byt5.py | 13 +++---- .../models/canine/test_tokenization_canine.py | 5 ++- .../m2m_100/test_tokenization_m2m_100.py | 20 +++++----- tests/models/marian/test_modeling_marian.py | 5 +-- .../models/marian/test_tokenization_marian.py | 5 +-- tests/models/mbart/test_tokenization_mbart.py | 37 ++++++++----------- .../mbart50/test_tokenization_mbart50.py | 37 ++++++++----------- tests/models/mvp/test_tokenization_mvp.py | 11 ++---- tests/models/nllb/test_tokenization_nllb.py | 23 +++++------- .../pegasus/test_tokenization_pegasus.py | 14 +++---- .../perceiver/test_tokenization_perceiver.py | 7 ++-- .../models/plbart/test_tokenization_plbart.py | 34 +++++++---------- tests/models/t5/test_tokenization_t5.py | 13 +++---- tests/models/tapex/test_tokenization_tapex.py | 37 +++++++++---------- 48 files changed, 217 insertions(+), 319 deletions(-) diff --git a/docs/source/en/model_doc/m2m_100.mdx b/docs/source/en/model_doc/m2m_100.mdx index 65e119aa4eea..f0a7714d2418 100644 --- a/docs/source/en/model_doc/m2m_100.mdx +++ b/docs/source/en/model_doc/m2m_100.mdx @@ -55,9 +55,7 @@ 
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en src_text = "Life is like a box of chocolates." tgt_text = "La vie est comme une boîte de chocolat." -model_inputs = tokenizer(src_text, return_tensors="pt") -with tokenizer.as_target_tokenizer(): - labels = tokenizer(tgt_text, return_tensors="pt").input_ids +model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt") -loss = model(**model_inputs, labels=labels) # forward pass +loss = model(**model_inputs).loss # forward pass ``` diff --git a/docs/source/en/model_doc/marian.mdx b/docs/source/en/model_doc/marian.mdx index 7b10c9309a0b..b415e9413c40 100644 --- a/docs/source/en/model_doc/marian.mdx +++ b/docs/source/en/model_doc/marian.mdx @@ -155,7 +155,6 @@ Example of translating english to many romance languages, using old-style 2 char ## MarianTokenizer [[autodoc]] MarianTokenizer - - as_target_tokenizer ## MarianModel diff --git a/docs/source/en/model_doc/mbart.mdx b/docs/source/en/model_doc/mbart.mdx index 0f3d82ce5dac..b24e31f33c9f 100644 --- a/docs/source/en/model_doc/mbart.mdx +++ b/docs/source/en/model_doc/mbart.mdx @@ -34,8 +34,8 @@ model is multilingual it expects the sequences in a different format. A special source and target text. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The target text format is `[tgt_lang_code] X [eos]`. `bos` is never used. -The regular [`~MBartTokenizer.__call__`] will encode source text format, and it should be wrapped -inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode target text format. +The regular [`~MBartTokenizer.__call__`] will encode source text format passed as first argument or with the `text` +keyword, and target text format passed with the `text_target` keyword argument. - Supervised training @@ -46,13 +46,11 @@ inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode tar >>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria" >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria" ->>> inputs = tokenizer(example_english_phrase, return_tensors="pt") ->>> with tokenizer.as_target_tokenizer(): -...
labels = tokenizer(expected_translation_romanian, return_tensors="pt") +>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt") >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro") >>> # forward pass ->>> model(**inputs, labels=batch["labels"]) +>>> model(**inputs) ``` - Generation @@ -108,11 +106,9 @@ tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_ src_text = " UN Chief Says There Is No Military Solution in Syria" tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria" -model_inputs = tokenizer(src_text, return_tensors="pt") -with tokenizer.as_target_tokenizer(): - labels = tokenizer(tgt_text, return_tensors="pt").input_ids +model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt") -model(**model_inputs, labels=labels) # forward pass +model(**model_inputs) # forward pass ``` - Generation @@ -154,7 +150,6 @@ tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) ## MBartTokenizer [[autodoc]] MBartTokenizer - - as_target_tokenizer - build_inputs_with_special_tokens ## MBartTokenizerFast diff --git a/docs/source/en/model_doc/nllb.mdx b/docs/source/en/model_doc/nllb.mdx index 477ef1ca83ec..d2c0089fa3a1 100644 --- a/docs/source/en/model_doc/nllb.mdx +++ b/docs/source/en/model_doc/nllb.mdx @@ -91,7 +91,6 @@ UN-Chef sagt, es gibt keine militärische Lösung in Syrien ## NllbTokenizer [[autodoc]] NllbTokenizer - - as_target_tokenizer - build_inputs_with_special_tokens ## NllbTokenizerFast diff --git a/docs/source/en/model_doc/plbart.mdx b/docs/source/en/model_doc/plbart.mdx index 6e3e4a5b7773..0755bb9a56e1 100644 --- a/docs/source/en/model_doc/plbart.mdx +++ b/docs/source/en/model_doc/plbart.mdx @@ -45,8 +45,9 @@ target text format is `[tgt_lang_code] X [eos]`. `bos` is never used. However, for fine-tuning, in some cases no language token is provided in cases where a single language is used. Please refer to [the paper](https://arxiv.org/abs/2103.06333) to learn more about this. -In cases where the language code is needed, The regular [`~PLBartTokenizer.__call__`] will encode source text format, and it should be wrapped -inside the context manager [`~PLBartTokenizer.as_target_tokenizer`] to encode target text format. +In cases where the language code is needed, the regular [`~PLBartTokenizer.__call__`] will encode source text format +when you pass texts as the first argument or with the keyword argument `text`, and will encode target text format if +it's passed with the `text_target` keyword argument. - Supervised training @@ -56,11 +57,7 @@ inside the context manager [`~PLBartTokenizer.as_target_tokenizer`] to encode ta >>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="en_XX", tgt_lang="python") >>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])" >>> expected_translation_english = "Returns the maximum value of a b c." ->>> inputs = tokenizer(example_python_phrase, return_tensors="pt") ->>> with tokenizer.as_target_tokenizer(): -... 
labels = tokenizer(expected_translation_english, return_tensors="pt") ->>> inputs["labels"] = labels["input_ids"] ->>> # forward pass +>>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt") >>> model(**inputs) ``` @@ -88,7 +85,6 @@ inside the context manager [`~PLBartTokenizer.as_target_tokenizer`] to encode ta ## PLBartTokenizer [[autodoc]] PLBartTokenizer - - as_target_tokenizer - build_inputs_with_special_tokens ## PLBartModel diff --git a/docs/source/en/tasks/summarization.mdx b/docs/source/en/tasks/summarization.mdx index 1c73c7396e64..f636141a1507 100644 --- a/docs/source/en/tasks/summarization.mdx +++ b/docs/source/en/tasks/summarization.mdx @@ -67,7 +67,7 @@ Load the T5 tokenizer to process `text` and `summary`: The preprocessing function needs to: 1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks. -2. Use a context manager with the `as_target_tokenizer()` function to parallelize tokenization of inputs and labels. +2. Use the `text_target` keyword argument when tokenizing labels. 3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter. ```py @@ -78,8 +78,7 @@ The preprocessing function needs to: ... inputs = [prefix + doc for doc in examples["text"]] ... model_inputs = tokenizer(inputs, max_length=1024, truncation=True) -... with tokenizer.as_target_tokenizer(): -... labels = tokenizer(examples["summary"], max_length=128, truncation=True) +... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) ... model_inputs["labels"] = labels["input_ids"] ... return model_inputs diff --git a/docs/source/en/tasks/translation.mdx b/docs/source/en/tasks/translation.mdx index 4f628b06db15..d17b87041418 100644 --- a/docs/source/en/tasks/translation.mdx +++ b/docs/source/en/tasks/translation.mdx @@ -78,12 +78,7 @@ The preprocessing function needs to: >>> def preprocess_function(examples): ... inputs = [prefix + example[source_lang] for example in examples["translation"]] ... targets = [example[target_lang] for example in examples["translation"]] -... model_inputs = tokenizer(inputs, max_length=128, truncation=True) - -... with tokenizer.as_target_tokenizer(): -... labels = tokenizer(targets, max_length=128, truncation=True) - -... model_inputs["labels"] = labels["input_ids"] +... model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True) ...
return model_inputs ``` diff --git a/examples/flax/image-captioning/run_image_captioning_flax.py b/examples/flax/image-captioning/run_image_captioning_flax.py index a4deab8041b2..e3ea20e85770 100644 --- a/examples/flax/image-captioning/run_image_captioning_flax.py +++ b/examples/flax/image-captioning/run_image_captioning_flax.py @@ -551,11 +551,10 @@ def tokenization_fn(examples, max_target_length): targets = captions model_inputs = {} - # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer( - targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np" - ) + + labels = tokenizer( + targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np" + ) model_inputs["labels"] = labels["input_ids"] decoder_input_ids = shift_tokens_right_fn( labels["input_ids"], model.config.pad_token_id, model.config.decoder_start_token_id diff --git a/examples/flax/summarization/run_summarization_flax.py b/examples/flax/summarization/run_summarization_flax.py index a1b5fc37e24c..019964e2a5b2 100644 --- a/examples/flax/summarization/run_summarization_flax.py +++ b/examples/flax/summarization/run_summarization_flax.py @@ -586,10 +586,13 @@ def preprocess_function(examples): ) # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer( - targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np" - ) + labels = tokenizer( + text_target=targets, + max_length=max_target_length, + padding="max_length", + truncation=True, + return_tensors="np", + ) model_inputs["labels"] = labels["input_ids"] decoder_input_ids = shift_tokens_right_fn( diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index 6249613313c8..70bf3272b628 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -453,9 +453,8 @@ def preprocess_function(examples): inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column) model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True) - # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True) + # Tokenize targets with text_target=... + labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. @@ -479,9 +478,8 @@ def preprocess_validation_function(examples): return_overflowing_tokens=True, return_offsets_mapping=True, ) - # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True) + # Tokenize targets with the `text_target` keyword argument + labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. 
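All of the example-script changes above and below collapse the old two-step target encoding into a single tokenizer call. A minimal self-contained sketch of the resulting pattern (the checkpoint name and column names here are illustrative assumptions, not taken from any of the scripts):

```python
from transformers import AutoTokenizer

# Illustrative checkpoint; any seq2seq tokenizer supporting `text_target` works.
tokenizer = AutoTokenizer.from_pretrained("t5-small")


def preprocess_function(examples):
    # One call tokenizes sources and targets together; the tokenizer switches
    # to target mode internally for the texts passed via `text_target`.
    model_inputs = tokenizer(
        examples["source"],
        text_target=examples["target"],
        max_length=128,
        truncation=True,
    )
    # `model_inputs` now contains input_ids, attention_mask and labels.
    return model_inputs
```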
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 33aa2bfd2c0c..bb0bdfa74327 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -522,9 +522,8 @@ def preprocess_function(examples): inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) - # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) + # Tokenize targets with the `text_target` keyword argument + labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index c38925743090..16f24cbdabbf 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -470,9 +470,8 @@ def preprocess_function(examples): inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True) - # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) + # Tokenize targets with the `text_target` keyword argument + labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index a53fd039dc19..747234a1da1e 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -443,9 +443,8 @@ def preprocess_function(examples): inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) - # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) + # Tokenize targets with the `text_target` keyword argument + labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. 
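The `# If we are padding here ...` context lines around these hunks refer to a masking step that the patch leaves untouched: when labels are padded to `max_length`, pad token ids are replaced by -100 so the loss ignores them. A small sketch of that step (`mask_pad_tokens` is a hypothetical helper, not a function from the scripts):

```python
def mask_pad_tokens(labels_input_ids, pad_token_id):
    # Replace every pad token id by -100, the index ignored by the loss.
    return [
        [(token if token != pad_token_id else -100) for token in label]
        for label in labels_input_ids
    ]


# With pad_token_id=0, trailing padding becomes -100:
assert mask_pad_tokens([[42, 7, 0, 0]], 0) == [[42, 7, -100, -100]]
```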
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 7227681d0589..6db6e11c500f 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -452,9 +452,8 @@ def preprocess_function(examples): inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True) - # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) + # Tokenize targets with the `text_target` keyword argument + labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. diff --git a/examples/research_projects/tapex/run_wikisql_with_tapex.py b/examples/research_projects/tapex/run_wikisql_with_tapex.py index 461bfbec9ae3..7573893629c6 100644 --- a/examples/research_projects/tapex/run_wikisql_with_tapex.py +++ b/examples/research_projects/tapex/run_wikisql_with_tapex.py @@ -437,13 +437,12 @@ def _convert_table_types(_table): table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True ) - with tokenizer.as_target_tokenizer(): - labels = tokenizer( - answer=[", ".join(answer) for answer in answers], - max_length=max_target_length, - padding=padding, - truncation=True, - ) + labels = tokenizer( + answer=[", ".join(answer) for answer in answers], + max_length=max_target_length, + padding=padding, + truncation=True, + ) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. diff --git a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py index 1750adc546f0..7ffa8f5f91cc 100644 --- a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py +++ b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py @@ -413,13 +413,12 @@ def preprocess_tableqa_function(examples, is_training=False): table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True ) - with tokenizer.as_target_tokenizer(): - labels = tokenizer( - answer=[", ".join(answer) for answer in answers], - max_length=max_target_length, - padding=padding, - truncation=True, - ) + labels = tokenizer( + answer=[", ".join(answer) for answer in answers], + max_length=max_target_length, + padding=padding, + truncation=True, + ) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. 
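The TAPEX hunks just above differ slightly from the plain-text scripts: as they show, [`TapexTokenizer`] encodes targets through its `answer` keyword rather than `text_target`. A rough sketch of a single example, assuming the `microsoft/tapex-base` checkpoint and a toy table (both illustrative):

```python
import pandas as pd

from transformers import TapexTokenizer

# Assumed checkpoint name, for illustration only.
tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")

table = pd.DataFrame({"city": ["Paris", "Lyon"], "inhabitants": ["2100000", "500000"]})

# Encode the table/question pair as inputs, and the answer directly as labels,
# with no target-tokenizer context manager.
model_inputs = tokenizer(table=table, query="How many inhabitants does Paris have?", return_tensors="pt")
labels = tokenizer(answer="2100000", return_tensors="pt")
model_inputs["labels"] = labels["input_ids"]
```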
diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index fdeb3f624985..9994f2feb91b 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -503,9 +503,8 @@ def preprocess_function(examples): inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) - # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) + # Tokenize targets with the `text_target` keyword argument + labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index b85b5bcb71b7..5be4728a814c 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -457,9 +457,8 @@ def preprocess_function(examples): inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) - # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) + # Tokenize targets with the `text_target` keyword argument + labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py index c11c8782f227..b67b82fb7a58 100644 --- a/src/transformers/models/m2m_100/tokenization_m2m_100.py +++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py @@ -115,10 +115,8 @@ class M2M100Tokenizer(PreTrainedTokenizer): >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro") >>> src_text = " UN Chief Says There Is No Military Solution in Syria" >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria" - >>> model_inputs = tokenizer(src_text, return_tensors="pt") - >>> with tokenizer.as_target_tokenizer(): - ... 
labels = tokenizer(tgt_text, return_tensors="pt").input_ids - >>> model(**model_inputs, labels=labels) # should work + >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt") + >>> model(**model_inputs) # should work ```""" vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index 6967a675813c..66eb5a44c5bf 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -111,10 +111,7 @@ class MarianTokenizer(PreTrainedTokenizer): >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de") >>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."] >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional - >>> inputs = tokenizer(src_texts, return_tensors="pt", padding=True) - >>> with tokenizer.as_target_tokenizer(): - ... labels = tokenizer(tgt_texts, return_tensors="pt", padding=True) - >>> inputs["labels"] = labels["input_ids"] + >>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True) # keys [input_ids, attention_mask, labels]. >>> outputs = model(**inputs) # should work diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py index 431908b0e97c..65460746425f 100644 --- a/src/transformers/models/mbart/tokenization_mbart.py +++ b/src/transformers/models/mbart/tokenization_mbart.py @@ -68,10 +68,7 @@ class MBartTokenizer(PreTrainedTokenizer): >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO") >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria" - >>> inputs = tokenizer(example_english_phrase, return_tensors="pt") - >>> with tokenizer.as_target_tokenizer(): - ... labels = tokenizer(expected_translation_romanian, return_tensors="pt") - >>> inputs["labels"] = labels["input_ids"] + >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt") ```""" vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py index 85937a5fd8ed..8bf75ebe59c0 100644 --- a/src/transformers/models/mbart/tokenization_mbart_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart_fast.py @@ -81,10 +81,7 @@ class MBartTokenizerFast(PreTrainedTokenizerFast): ... ) >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria" - >>> inputs = tokenizer(example_english_phrase, return_tensors="pt") - >>> with tokenizer.as_target_tokenizer(): - ... 
labels = tokenizer(expected_translation_romanian, return_tensors="pt") - >>> inputs["labels"] = labels["input_ids"] + >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt") ```""" vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/mbart50/tokenization_mbart50.py b/src/transformers/models/mbart50/tokenization_mbart50.py index 274b7f95f829..707a97734927 100644 --- a/src/transformers/models/mbart50/tokenization_mbart50.py +++ b/src/transformers/models/mbart50/tokenization_mbart50.py @@ -101,10 +101,8 @@ class MBart50Tokenizer(PreTrainedTokenizer): >>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO") >>> src_text = " UN Chief Says There Is No Military Solution in Syria" >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria" - >>> model_inputs = tokenizer(src_text, return_tensors="pt") - >>> with tokenizer.as_target_tokenizer(): - ... labels = tokenizer(tgt_text, return_tensors="pt").input_ids - >>> # model(**model_inputs, labels=labels) should work + >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt") + >>> # model(**model_inputs) should work ```""" vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/mbart50/tokenization_mbart50_fast.py b/src/transformers/models/mbart50/tokenization_mbart50_fast.py index b0f41a0fd719..1ab8ff06e260 100644 --- a/src/transformers/models/mbart50/tokenization_mbart50_fast.py +++ b/src/transformers/models/mbart50/tokenization_mbart50_fast.py @@ -97,10 +97,8 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast): >>> tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO") >>> src_text = " UN Chief Says There Is No Military Solution in Syria" >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria" - >>> model_inputs = tokenizer(src_text, return_tensors="pt") - >>> with tokenizer.as_target_tokenizer(): - ... labels = tokenizer(tgt_text, return_tensors="pt").input_ids - >>> # model(**model_inputs, labels=labels) should work + >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt") + >>> # model(**model_inputs) should work ```""" vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/mt5/modeling_flax_mt5.py b/src/transformers/models/mt5/modeling_flax_mt5.py index 841d2069e65c..4f2fa5b9fb39 100644 --- a/src/transformers/models/mt5/modeling_flax_mt5.py +++ b/src/transformers/models/mt5/modeling_flax_mt5.py @@ -57,8 +57,7 @@ class FlaxMT5Model(FlaxT5Model): >>> summary = "Weiter Verhandlung in Syrien." >>> inputs = tokenizer(article, return_tensors="np") - >>> with tokenizer.as_target_tokenizer(): - ... decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids + >>> decoder_input_ids = tokenizer(text_target=summary, return_tensors="np").input_ids >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=decoder_input_ids) >>> hidden_states = outputs.last_hidden_state @@ -84,8 +83,7 @@ class FlaxMT5EncoderModel(FlaxT5EncoderModel): >>> summary = "Weiter Verhandlung in Syrien." >>> inputs = tokenizer(article, return_tensors="np") - >>> with tokenizer.as_target_tokenizer(): - ... 
decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids + >>> decoder_input_ids = tokenizer(text_target=summary, return_tensors="np").input_ids >>> outputs = model(input_ids=inputs["input_ids"]) >>> hidden_states = outputs.last_hidden_state @@ -111,8 +109,7 @@ class FlaxMT5ForConditionalGeneration(FlaxT5ForConditionalGeneration): >>> summary = "Weiter Verhandlung in Syrien." >>> inputs = tokenizer(article, return_tensors="np") - >>> with tokenizer.as_target_tokenizer(): - ... decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids + >>> decoder_input_ids = tokenizer(text_target=summary, return_tensors="np").input_ids >>> outputs = model(**inputs, decoder_input_ids=decoder_input_ids) >>> logits = outputs.logits diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index 8c19a63eded3..c562b011522d 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -40,8 +40,7 @@ class MT5Model(T5Model): >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien." >>> summary = "Weiter Verhandlung in Syrien." >>> inputs = tokenizer(article, return_tensors="pt") - >>> with tokenizer.as_target_tokenizer(): - ... labels = tokenizer(summary, return_tensors="pt") + >>> labels = tokenizer(text_target=summary, return_tensors="pt") >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"]) >>> hidden_states = outputs.last_hidden_state @@ -73,11 +72,9 @@ class MT5ForConditionalGeneration(T5ForConditionalGeneration): >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small") >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien." >>> summary = "Weiter Verhandlung in Syrien." - >>> inputs = tokenizer(article, return_tensors="pt") - >>> with tokenizer.as_target_tokenizer(): - ... labels = tokenizer(summary, return_tensors="pt") + >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt") - >>> outputs = model(**inputs, labels=labels["input_ids"]) + >>> outputs = model(**inputs) >>> loss = outputs.loss ```""" diff --git a/src/transformers/models/mt5/modeling_tf_mt5.py b/src/transformers/models/mt5/modeling_tf_mt5.py index 2808b8421a16..71aa0bb66a7a 100644 --- a/src/transformers/models/mt5/modeling_tf_mt5.py +++ b/src/transformers/models/mt5/modeling_tf_mt5.py @@ -40,8 +40,7 @@ class TFMT5Model(TFT5Model): >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien." >>> summary = "Weiter Verhandlung in Syrien." >>> inputs = tokenizer(article, return_tensors="tf") - >>> with tokenizer.as_target_tokenizer(): - ... labels = tokenizer(summary, return_tensors="tf") + >>> labels = tokenizer(text_target=summary, return_tensors="tf") >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"]) >>> hidden_states = outputs.last_hidden_state @@ -64,11 +63,9 @@ class TFMT5ForConditionalGeneration(TFT5ForConditionalGeneration): >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small") >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien." >>> summary = "Weiter Verhandlung in Syrien." - >>> inputs = tokenizer(article, return_tensors="tf") - >>> with tokenizer.as_target_tokenizer(): - ... 
labels = tokenizer(summary, return_tensors="tf") + >>> inputs = tokenizer(article, text_target=summary, return_tensors="tf") - >>> outputs = model(**inputs, labels=labels["input_ids"]) + >>> outputs = model(**inputs) >>> loss = outputs.loss ```""" diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py index d0c0ce8b74ec..6a326fd3ca10 100644 --- a/src/transformers/models/nllb/tokenization_nllb.py +++ b/src/transformers/models/nllb/tokenization_nllb.py @@ -66,10 +66,7 @@ class NllbTokenizer(PreTrainedTokenizer): ... ) >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie." - >>> inputs = tokenizer(example_english_phrase, return_tensors="pt") - >>> with tokenizer.as_target_tokenizer(): - ... labels = tokenizer(expected_translation_french, return_tensors="pt") - >>> inputs["labels"] = labels["input_ids"] + >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt") ``` Args: diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py index 404b7c093e05..1afe27f43b4e 100644 --- a/src/transformers/models/nllb/tokenization_nllb_fast.py +++ b/src/transformers/models/nllb/tokenization_nllb_fast.py @@ -79,10 +79,7 @@ class NllbTokenizerFast(PreTrainedTokenizerFast): ... ) >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie." - >>> inputs = tokenizer(example_english_phrase, return_tensors="pt") - >>> with tokenizer.as_target_tokenizer(): - ... labels = tokenizer(expected_translation_french, return_tensors="pt") - >>> inputs["labels"] = labels["input_ids"] + >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt") ``` Args: diff --git a/src/transformers/models/plbart/tokenization_plbart.py b/src/transformers/models/plbart/tokenization_plbart.py index 93e109473bdf..411df996926a 100644 --- a/src/transformers/models/plbart/tokenization_plbart.py +++ b/src/transformers/models/plbart/tokenization_plbart.py @@ -152,10 +152,7 @@ class PLBartTokenizer(PreTrainedTokenizer): >>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX") >>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])" >>> expected_translation_english = "Returns the maximum value of a b c." - >>> inputs = tokenizer(example_python_phrase, return_tensors="pt") - >>> with tokenizer.as_target_tokenizer(): - ... 
labels = tokenizer(expected_translation_english, return_tensors="pt") - >>> inputs["labels"] = labels["input_ids"] + >>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt") ```""" vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index 1d6a62b2013d..41af393c6710 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -818,8 +818,7 @@ def forward( >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever) >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt") - >>> with tokenizer.as_target_tokenizer(): - ... targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt") + >>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt") >>> input_ids = inputs["input_ids"] >>> labels = targets["input_ids"] >>> outputs = model(input_ids=input_ids, labels=labels) @@ -1287,8 +1286,7 @@ def forward( >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever) >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt") - >>> with tokenizer.as_target_tokenizer(): - ... targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt") + >>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt") >>> input_ids = inputs["input_ids"] >>> labels = targets["input_ids"] >>> outputs = model(input_ids=input_ids, labels=labels) diff --git a/src/transformers/models/rag/tokenization_rag.py b/src/transformers/models/rag/tokenization_rag.py index 485c2c448373..5b6ec67e6bf8 100644 --- a/src/transformers/models/rag/tokenization_rag.py +++ b/src/transformers/models/rag/tokenization_rag.py @@ -105,17 +105,16 @@ def prepare_seq2seq_batch( if tgt_texts is None: return model_inputs # Process tgt_texts - with self.as_target_tokenizer(): - if max_target_length is None: - max_target_length = self.current_tokenizer.model_max_length - labels = self( - tgt_texts, - add_special_tokens=True, - return_tensors=return_tensors, - padding=padding, - max_length=max_target_length, - truncation=truncation, - **kwargs, - ) + if max_target_length is None: + max_target_length = self.current_tokenizer.model_max_length + labels = self( + text_target=tgt_texts, + add_special_tokens=True, + return_tensors=return_tensors, + padding=padding, + max_length=max_target_length, + truncation=truncation, + **kwargs, + ) model_inputs["labels"] = labels["input_ids"] return model_inputs diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index b8ab1e09c445..a945b1abe58f 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -3658,14 +3658,17 @@ def prepare_seq2seq_batch( # docstyle-ignore formatted_warning = """ `prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular -`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare -your targets. +`__call__` method to prepare your inputs and targets. Here is a short example: +model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...) 
+ +If you need to use different keyword arguments for the source and target texts, you should do two calls like +this: + model_inputs = tokenizer(src_texts, ...) -with tokenizer.as_target_tokenizer(): - labels = tokenizer(tgt_texts, ...) +labels = tokenizer(text_target=tgt_texts, ...) model_inputs["labels"] = labels["input_ids"] See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice. @@ -3691,16 +3694,15 @@ def prepare_seq2seq_batch( # Process tgt_texts if max_target_length is None: max_target_length = max_length - with self.as_target_tokenizer(): - labels = self( - tgt_texts, - add_special_tokens=True, - return_tensors=return_tensors, - padding=padding, - max_length=max_target_length, - truncation=truncation, - **kwargs, - ) + labels = self( + text_target=tgt_texts, + add_special_tokens=True, + return_tensors=return_tensors, + padding=padding, + max_length=max_target_length, + truncation=truncation, + **kwargs, + ) model_inputs["labels"] = labels["input_ids"] return model_inputs diff --git a/tests/models/bart/test_tokenization_bart.py b/tests/models/bart/test_tokenization_bart.py index b8e216e69ba2..24ea6e1e5cd9 100644 --- a/tests/models/bart/test_tokenization_bart.py +++ b/tests/models/bart/test_tokenization_bart.py @@ -112,14 +112,13 @@ def test_prepare_batch_empty_target_text(self): self.assertNotIn("decoder_attention_mask", batch) @require_torch - def test_as_target_tokenizer_target_length(self): + def test_tokenizer_as_target_length(self): tgt_text = [ "Summary of the text.", "Another summary.", ] for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: - with tokenizer.as_target_tokenizer(): - targets = tokenizer(tgt_text, max_length=32, padding="max_length", return_tensors="pt") + targets = tokenizer(text_target=tgt_text, max_length=32, padding="max_length", return_tensors="pt") self.assertEqual(32, targets["input_ids"].shape[1]) @require_torch @@ -140,8 +139,7 @@ def test_special_tokens(self): ] for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: inputs = tokenizer(src_text, return_tensors="pt") - with tokenizer.as_target_tokenizer(): - targets = tokenizer(tgt_text, return_tensors="pt") + targets = tokenizer(text_target=tgt_text, return_tensors="pt") input_ids = inputs["input_ids"] labels = targets["input_ids"] self.assertTrue((input_ids[:, 0] == tokenizer.bos_token_id).all().item()) diff --git a/tests/models/byt5/test_tokenization_byt5.py b/tests/models/byt5/test_tokenization_byt5.py index 70cfa40ef919..85057c5278bb 100644 --- a/tests/models/byt5/test_tokenization_byt5.py +++ b/tests/models/byt5/test_tokenization_byt5.py @@ -152,10 +152,9 @@ def test_max_length_integration(self): "Summary of the text.", "Another summary.", ] - with tokenizer.as_target_tokenizer(): - targets = tokenizer( - tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK - ) + targets = tokenizer( + text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK + ) self.assertEqual(32, targets["input_ids"].shape[1]) def test_eos_in_input(self): @@ -167,12 +166,10 @@ def test_eos_in_input(self): expected_tgt_tokens = [86, 120, 112, 112, 100, 117, 124, 35, 114, 105, 35, 119, 107, 104, 35, 119, 104, 123, 119, 49, 35, 1] # fmt: on - batch = tokenizer(src_text) - with tokenizer.as_target_tokenizer(): - targets = tokenizer(tgt_text) + batch = tokenizer(src_text, text_target=tgt_text) self.assertEqual(expected_src_tokens,
batch["input_ids"][0]) - self.assertEqual(expected_tgt_tokens, targets["input_ids"][0]) + self.assertEqual(expected_tgt_tokens, batch["labels"][0]) # cannot use default save_and_load_tokenzier test method because tokenzier has no vocab def test_save_and_load_tokenizer(self): diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py index 0e016d523b5c..6ae27082cceb 100644 --- a/tests/models/canine/test_tokenization_canine.py +++ b/tests/models/canine/test_tokenization_canine.py @@ -80,8 +80,9 @@ def test_max_length_integration(self): "What's the weater?", "It's about 25 degrees.", ] - with tokenizer.as_target_tokenizer(): - targets = tokenizer(tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors="pt") + targets = tokenizer( + text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors="pt" + ) self.assertEqual(32, targets["input_ids"].shape[1]) # cannot use default save_and_load_tokenzier test method because tokenzier has no vocab diff --git a/tests/models/m2m_100/test_tokenization_m2m_100.py b/tests/models/m2m_100/test_tokenization_m2m_100.py index 729deb6cd486..ca8349d94016 100644 --- a/tests/models/m2m_100/test_tokenization_m2m_100.py +++ b/tests/models/m2m_100/test_tokenization_m2m_100.py @@ -187,9 +187,7 @@ def test_batch_fairseq_parity(self): self.tokenizer.src_lang = "en" self.tokenizer.tgt_lang = "fr" - batch = self.tokenizer(self.src_text, padding=True, return_tensors="pt") - with self.tokenizer.as_target_tokenizer(): - batch["labels"] = self.tokenizer(self.tgt_text, padding=True, return_tensors="pt").input_ids + batch = self.tokenizer(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt") batch["decoder_input_ids"] = shift_tokens_right( batch["labels"], self.tokenizer.pad_token_id, self.tokenizer.eos_token_id @@ -217,17 +215,19 @@ def test_src_lang_setter(self): self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) @require_torch - def test_as_target_tokenizer(self): + def test_tokenizer_target_mode(self): self.tokenizer.tgt_lang = "mr" - with self.tokenizer.as_target_tokenizer(): - self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("mr")]) - self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) + self.tokenizer._switch_to_target_mode() + self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("mr")]) + self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) + self.tokenizer._switch_to_input_mode() self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id(self.tokenizer.src_lang)]) self.tokenizer.tgt_lang = "zh" - with self.tokenizer.as_target_tokenizer(): - self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("zh")]) - self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) + self.tokenizer._switch_to_target_mode() + self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("zh")]) + self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) + self.tokenizer._switch_to_input_mode() self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id(self.tokenizer.src_lang)]) @require_torch diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py index e454f981b4a0..6ca951e37aed 100644 --- a/tests/models/marian/test_modeling_marian.py +++ 
b/tests/models/marian/test_modeling_marian.py @@ -438,10 +438,7 @@ def test_forward(self): src, tgt = ["I am a small frog"], ["Ich bin ein kleiner Frosch."] expected_ids = [38, 121, 14, 697, 38848, 0] - model_inputs = self.tokenizer(src, return_tensors="pt").to(torch_device) - with self.tokenizer.as_target_tokenizer(): - targets = self.tokenizer(tgt, return_tensors="pt") - model_inputs["labels"] = targets["input_ids"].to(torch_device) + model_inputs = self.tokenizer(src, text_target=tgt, return_tensors="pt").to(torch_device) self.assertListEqual(expected_ids, model_inputs.input_ids[0].tolist()) diff --git a/tests/models/marian/test_tokenization_marian.py b/tests/models/marian/test_tokenization_marian.py index 2cbc0b0a3fe7..6a079036bb6d 100644 --- a/tests/models/marian/test_tokenization_marian.py +++ b/tests/models/marian/test_tokenization_marian.py @@ -145,9 +145,8 @@ def test_tokenizer_integration_seperate_vocabs(self): src_ids = tokenizer(source_text).input_ids self.assertListEqual(src_ids, expected_src_ids) - with tokenizer.as_target_tokenizer(): - target_ids = tokenizer(target_text).input_ids - self.assertListEqual(target_ids, expected_target_ids) + target_ids = tokenizer(text_target=target_text).input_ids + self.assertListEqual(target_ids, expected_target_ids) decoded = tokenizer.decode(target_ids, skip_special_tokens=True) self.assertEqual(decoded, target_text) diff --git a/tests/models/mbart/test_tokenization_mbart.py b/tests/models/mbart/test_tokenization_mbart.py index e80531051b65..f65662dbe247 100644 --- a/tests/models/mbart/test_tokenization_mbart.py +++ b/tests/models/mbart/test_tokenization_mbart.py @@ -265,33 +265,27 @@ def test_special_tokens_unaffacted_by_save_load(self): @require_torch def test_batch_fairseq_parity(self): - batch = self.tokenizer(self.src_text, padding=True) - with self.tokenizer.as_target_tokenizer(): - targets = self.tokenizer(self.tgt_text, padding=True, return_tensors="pt") - labels = targets["input_ids"] - batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id).tolist() + batch = self.tokenizer(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt") + batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id) # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4 - assert batch.input_ids[1][-2:] == [2, EN_CODE] - assert batch.decoder_input_ids[1][0] == RO_CODE + assert batch.input_ids[1][-2:].tolist() == [2, EN_CODE] + assert batch.decoder_input_ids[1][0].tolist() == RO_CODE assert batch.decoder_input_ids[1][-1] == 2 - assert labels[1][-2:].tolist() == [2, RO_CODE] + assert batch.labels[1][-2:].tolist() == [2, RO_CODE] @require_torch def test_enro_tokenizer_prepare_batch(self): batch = self.tokenizer( - self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt" + self.src_text, + text_target=self.tgt_text, + padding=True, + truncation=True, + max_length=len(self.expected_src_tokens), + return_tensors="pt", ) - with self.tokenizer.as_target_tokenizer(): - targets = self.tokenizer( - self.tgt_text, - padding=True, - truncation=True, - max_length=len(self.expected_src_tokens), - return_tensors="pt", - ) - labels = targets["input_ids"] - batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id) + + batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id) self.assertIsInstance(batch, BatchEncoding) @@ -306,8 +300,9 @@ def 
@@ -306,8 +300,9 @@ def test_enro_tokenizer_prepare_batch(self):
 
     def test_seq2seq_max_length(self):
         batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt")
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt")
+        targets = self.tokenizer(
+            text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt"
+        )
         labels = targets["input_ids"]
         batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
diff --git a/tests/models/mbart50/test_tokenization_mbart50.py b/tests/models/mbart50/test_tokenization_mbart50.py
index 5a65d8856656..d10d51df907c 100644
--- a/tests/models/mbart50/test_tokenization_mbart50.py
+++ b/tests/models/mbart50/test_tokenization_mbart50.py
@@ -256,35 +256,27 @@ def test_special_tokens_unaffacted_by_save_load(self):
 
     @require_torch
     def test_batch_fairseq_parity(self):
-        batch = self.tokenizer(self.src_text, padding=True)
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(self.tgt_text, padding=True, return_tensors="pt")
-        labels = targets["input_ids"]
-        batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id).tolist()
-        labels = labels.tolist()
+        batch = self.tokenizer(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt")
+        batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
 
         # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4
         assert batch.input_ids[1][0] == EN_CODE
         assert batch.input_ids[1][-1] == 2
-        assert labels[1][0] == RO_CODE
-        assert labels[1][-1] == 2
-        assert batch.decoder_input_ids[1][:2] == [2, RO_CODE]
+        assert batch.labels[1][0] == RO_CODE
+        assert batch.labels[1][-1] == 2
+        assert batch.decoder_input_ids[1][:2].tolist() == [2, RO_CODE]
 
     @require_torch
     def test_tokenizer_prepare_batch(self):
         batch = self.tokenizer(
-            self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt"
+            self.src_text,
+            text_target=self.tgt_text,
+            padding=True,
+            truncation=True,
+            max_length=len(self.expected_src_tokens),
+            return_tensors="pt",
         )
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(
-                self.tgt_text,
-                padding=True,
-                truncation=True,
-                max_length=len(self.expected_src_tokens),
-                return_tensors="pt",
-            )
-        labels = targets["input_ids"]
-        batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
+        batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
 
         self.assertIsInstance(batch, BatchEncoding)
@@ -299,8 +291,9 @@ def test_tokenizer_prepare_batch(self):
 
     def test_seq2seq_max_target_length(self):
         batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt")
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt")
+        targets = self.tokenizer(
+            text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt"
+        )
         labels = targets["input_ids"]
         batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
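When source and target need different truncation lengths, the targets can still be tokenized in a separate call that passes only `text_target`. A sketch, assuming the facebook/mbart-large-50 checkpoint is reachable:

# Two calls keep independent max_length settings for inputs and labels.
from transformers import MBart50Tokenizer

tokenizer = MBart50Tokenizer.from_pretrained(
    "facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO"
)
batch = tokenizer(["a fairly long source sentence"], max_length=3, truncation=True, return_tensors="pt")
targets = tokenizer(
    text_target=["a target"], max_length=10, padding="max_length", truncation=True, return_tensors="pt"
)
batch["labels"] = targets["input_ids"]  # 10 columns, independent of the source length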
diff --git a/tests/models/mvp/test_tokenization_mvp.py b/tests/models/mvp/test_tokenization_mvp.py
index ad3fad67c91d..71e83fba0e16 100644
--- a/tests/models/mvp/test_tokenization_mvp.py
+++ b/tests/models/mvp/test_tokenization_mvp.py
@@ -112,14 +112,13 @@ def test_prepare_batch_empty_target_text(self):
         self.assertNotIn("decoder_attention_mask", batch)
 
     @require_torch
-    def test_as_target_tokenizer_target_length(self):
+    def test_tokenizer_as_target_length(self):
         tgt_text = [
             "Summary of the text.",
             "Another summary.",
         ]
         for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            with tokenizer.as_target_tokenizer():
-                targets = tokenizer(tgt_text, max_length=32, padding="max_length", return_tensors="pt")
+            targets = tokenizer(text_target=tgt_text, max_length=32, padding="max_length", return_tensors="pt")
             self.assertEqual(32, targets["input_ids"].shape[1])
 
     @require_torch
@@ -139,11 +138,9 @@ def test_special_tokens(self):
             "Summary of the text.",
         ]
         for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            inputs = tokenizer(src_text, return_tensors="pt")
-            with tokenizer.as_target_tokenizer():
-                targets = tokenizer(tgt_text, return_tensors="pt")
+            inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
             input_ids = inputs["input_ids"]
-            labels = targets["input_ids"]
+            labels = inputs["labels"]
             self.assertTrue((input_ids[:, 0] == tokenizer.bos_token_id).all().item())
             self.assertTrue((labels[:, 0] == tokenizer.bos_token_id).all().item())
             self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item())
diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py
index 10575084a727..88feb0ae533d 100644
--- a/tests/models/nllb/test_tokenization_nllb.py
+++ b/tests/models/nllb/test_tokenization_nllb.py
@@ -373,19 +373,15 @@ def test_special_tokens_unaffacted_by_save_load(self):
     @require_torch
     def test_enro_tokenizer_prepare_batch(self):
         batch = self.tokenizer(
-            self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt"
+            self.src_text,
+            target_text=self.tgt_text,
+            padding=True,
+            truncation=True,
+            max_length=len(self.expected_src_tokens),
+            return_tensors="pt",
         )
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(
-                self.tgt_text,
-                padding=True,
-                truncation=True,
-                max_length=len(self.expected_src_tokens),
-                return_tensors="pt",
-            )
-        labels = targets["input_ids"]
         batch["decoder_input_ids"] = shift_tokens_right(
-            labels, self.tokenizer.pad_token_id, self.tokenizer.lang_code_to_id["ron_Latn"]
+            batch["labels"], self.tokenizer.pad_token_id, self.tokenizer.lang_code_to_id["ron_Latn"]
         )
 
         self.assertIsInstance(batch, BatchEncoding)
@@ -401,8 +397,9 @@ def test_enro_tokenizer_prepare_batch(self):
 
     def test_seq2seq_max_length(self):
         batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt")
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt")
+        targets = self.tokenizer(
+            text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt"
+        )
         labels = targets["input_ids"]
         batch["decoder_input_ids"] = shift_tokens_right(
             labels,
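The NLLB tests lean on the three-argument `shift_tokens_right` from the M2M100 module, which also takes the decoder start token. A sketch of that flow, assuming the facebook/nllb-200-distilled-600M checkpoint:

# The target-language id is looked up on the tokenizer and used as the
# decoder start token when shifting the labels.
from transformers import NllbTokenizer
from transformers.models.m2m_100.modeling_m2m_100 import shift_tokens_right

tokenizer = NllbTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="ron_Latn"
)
batch = tokenizer(["Hello"], text_target=["Salut"], padding=True, return_tensors="pt")
batch["decoder_input_ids"] = shift_tokens_right(
    batch["labels"], tokenizer.pad_token_id, tokenizer.lang_code_to_id["ron_Latn"]
)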
* 150, "short example"] tgt_texts = ["not super long but more than 5 tokens", "tiny"] batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt") - with self._large_tokenizer.as_target_tokenizer(): - targets = self._large_tokenizer( - tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt" - ) + targets = self._large_tokenizer( + text_target=tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt" + ) assert batch.input_ids.shape == (2, 1024) assert batch.attention_mask.shape == (2, 1024) @@ -174,10 +173,9 @@ def test_large_seq2seq_truncation(self): src_texts = ["This is going to be way too long." * 1000, "short example"] tgt_texts = ["not super long but more than 5 tokens", "tiny"] batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt") - with self._large_tokenizer.as_target_tokenizer(): - targets = self._large_tokenizer( - tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt" - ) + targets = self._large_tokenizer( + text_target=tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt" + ) assert batch.input_ids.shape == (2, 4096) assert batch.attention_mask.shape == (2, 4096) diff --git a/tests/models/perceiver/test_tokenization_perceiver.py b/tests/models/perceiver/test_tokenization_perceiver.py index ca61e9c856f1..3c7a67bcd2b9 100644 --- a/tests/models/perceiver/test_tokenization_perceiver.py +++ b/tests/models/perceiver/test_tokenization_perceiver.py @@ -146,10 +146,9 @@ def test_max_length_integration(self): "Summary of the text.", "Another summary.", ] - with tokenizer.as_target_tokenizer(): - targets = tokenizer( - tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK - ) + targets = tokenizer( + text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK + ) self.assertEqual(32, targets["input_ids"].shape[1]) # cannot use default save_and_load_tokenzier test method because tokenzier has no vocab diff --git a/tests/models/plbart/test_tokenization_plbart.py b/tests/models/plbart/test_tokenization_plbart.py index 9aed6040f3fd..2ce7cafbda6e 100644 --- a/tests/models/plbart/test_tokenization_plbart.py +++ b/tests/models/plbart/test_tokenization_plbart.py @@ -299,33 +299,26 @@ def test_special_tokens_unaffacted_by_save_load(self): @require_torch def test_batch_fairseq_parity(self): - batch = self.tokenizer(self.src_text, padding=True) - with self.tokenizer.as_target_tokenizer(): - targets = self.tokenizer(self.tgt_text, padding=True, return_tensors="pt") - labels = targets["input_ids"] - batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id).tolist() + batch = self.tokenizer(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt") + batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id) # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4 - self.assertEqual(batch.input_ids[1][-2:], [2, PYTHON_CODE]) + self.assertEqual(batch.input_ids[1][-2:].tolist(), [2, PYTHON_CODE]) self.assertEqual(batch.decoder_input_ids[1][0], EN_CODE) self.assertEqual(batch.decoder_input_ids[1][-1], 2) - self.assertEqual(labels[1][-2:].tolist(), [2, EN_CODE]) + self.assertEqual(batch.labels[1][-2:].tolist(), [2, EN_CODE]) @require_torch def test_python_en_tokenizer_prepare_batch(self): batch = self.tokenizer( - self.src_text, padding=True, truncation=True, 
diff --git a/tests/models/plbart/test_tokenization_plbart.py b/tests/models/plbart/test_tokenization_plbart.py
index 9aed6040f3fd..2ce7cafbda6e 100644
--- a/tests/models/plbart/test_tokenization_plbart.py
+++ b/tests/models/plbart/test_tokenization_plbart.py
@@ -299,33 +299,26 @@ def test_special_tokens_unaffacted_by_save_load(self):
 
     @require_torch
     def test_batch_fairseq_parity(self):
-        batch = self.tokenizer(self.src_text, padding=True)
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(self.tgt_text, padding=True, return_tensors="pt")
-        labels = targets["input_ids"]
-        batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id).tolist()
+        batch = self.tokenizer(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt")
+        batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
 
         # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4
-        self.assertEqual(batch.input_ids[1][-2:], [2, PYTHON_CODE])
+        self.assertEqual(batch.input_ids[1][-2:].tolist(), [2, PYTHON_CODE])
         self.assertEqual(batch.decoder_input_ids[1][0], EN_CODE)
         self.assertEqual(batch.decoder_input_ids[1][-1], 2)
-        self.assertEqual(labels[1][-2:].tolist(), [2, EN_CODE])
+        self.assertEqual(batch.labels[1][-2:].tolist(), [2, EN_CODE])
 
     @require_torch
     def test_python_en_tokenizer_prepare_batch(self):
         batch = self.tokenizer(
-            self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt"
+            self.src_text,
+            text_target=self.tgt_text,
+            padding=True,
+            truncation=True,
+            max_length=len(self.expected_src_tokens),
+            return_tensors="pt",
         )
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(
-                self.tgt_text,
-                padding=True,
-                truncation=True,
-                max_length=len(self.expected_src_tokens),
-                return_tensors="pt",
-            )
-        labels = targets["input_ids"]
-        batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
+        batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
 
         self.assertIsInstance(batch, BatchEncoding)
@@ -340,8 +333,9 @@ def test_python_en_tokenizer_prepare_batch(self):
 
     def test_seq2seq_max_length(self):
         batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt")
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt")
+        targets = self.tokenizer(
+            text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt"
+        )
         labels = targets["input_ids"]
         batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py
index 1c0fde222cdb..28d85c77c97c 100644
--- a/tests/models/t5/test_tokenization_t5.py
+++ b/tests/models/t5/test_tokenization_t5.py
@@ -210,10 +210,9 @@ def test_max_length(self):
             "Summary of the text.",
             "Another summary.",
         ]
-        with tokenizer.as_target_tokenizer():
-            targets = tokenizer(
-                tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK
-            )
+        targets = tokenizer(
+            text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK
+        )
         self.assertEqual(32, targets["input_ids"].shape[1])
 
     def test_outputs_not_longer_than_maxlen(self):
@@ -235,12 +234,10 @@ def test_eos_in_input(self):
         expected_src_tokens = [71, 307, 8986, 21, 4505, 1635, 1707, 5, 1]
         expected_tgt_tokens = [20698, 13, 8, 1499, 5, 1]
 
-        batch = tokenizer(src_text)
-        with tokenizer.as_target_tokenizer():
-            targets = tokenizer(tgt_text)
+        batch = tokenizer(src_text, text_target=tgt_text)
 
         self.assertEqual(expected_src_tokens, batch["input_ids"][0])
-        self.assertEqual(expected_tgt_tokens, targets["input_ids"][0])
+        self.assertEqual(expected_tgt_tokens, batch["labels"][0])
 
     def test_token_type_ids(self):
         src_text_1 = ["A first paragraph for summarization."]
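Without `return_tensors`, the combined call yields plain Python lists, which is what the T5 test above compares against. A sketch, assuming the t5-small checkpoint and sentencepiece are available:

# One call encodes both sides; lists come back when no tensor type is requested.
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
batch = tokenizer(["summarize: studies have shown..."], text_target=["studies show..."])
print(batch["input_ids"][0])  # source ids, ending in the EOS token
print(batch["labels"][0])     # target ids, also ending in EOS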
diff --git a/tests/models/tapex/test_tokenization_tapex.py b/tests/models/tapex/test_tokenization_tapex.py
index c959b780215b..dec0f507ed3c 100644
--- a/tests/models/tapex/test_tokenization_tapex.py
+++ b/tests/models/tapex/test_tokenization_tapex.py
@@ -859,9 +859,8 @@ def test_tokenizer_as_target(self):
         tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
         answer_text = "tapex is a good model!"
         expected_src_tokens = [0, 90, 5776, 1178, 16, 10, 205, 1421, 328, 2]
-        with tokenizer.as_target_tokenizer():
-            answer_encoding = tokenizer(answer=answer_text)
-            self.assertListEqual(answer_encoding.input_ids, expected_src_tokens)
+        answer_encoding = tokenizer(answer=answer_text)
+        self.assertListEqual(answer_encoding.input_ids, expected_src_tokens)
 
     @slow
     def test_tokenizer_lower_case(self):
@@ -870,23 +869,21 @@ def test_tokenizer_lower_case(self):
         answer_text = "Beijing, London, Paris"
         answer_text_lower = "beijing, london, paris"
 
-        with cased_tokenizer.as_target_tokenizer():
-            with uncased_tokenizer.as_target_tokenizer():
-                self.assertNotEqual(
-                    cased_tokenizer(answer=answer_text).input_ids, uncased_tokenizer(answer=answer_text).input_ids
-                )
-                self.assertEqual(
-                    cased_tokenizer(answer=answer_text_lower).input_ids,
-                    uncased_tokenizer(answer=answer_text).input_ids,
-                )
-                # batched encoding assert
-                self.assertNotEqual(
-                    cased_tokenizer(answer=[answer_text]).input_ids, uncased_tokenizer(answer=[answer_text]).input_ids
-                )
-                self.assertEqual(
-                    cased_tokenizer(answer=[answer_text_lower]).input_ids,
-                    uncased_tokenizer(answer=[answer_text]).input_ids,
-                )
+        self.assertNotEqual(
+            cased_tokenizer(answer=answer_text).input_ids, uncased_tokenizer(answer=answer_text).input_ids
+        )
+        self.assertEqual(
+            cased_tokenizer(answer=answer_text_lower).input_ids,
+            uncased_tokenizer(answer=answer_text).input_ids,
+        )
+        # batched encoding assert
+        self.assertNotEqual(
+            cased_tokenizer(answer=[answer_text]).input_ids, uncased_tokenizer(answer=[answer_text]).input_ids
+        )
+        self.assertEqual(
+            cased_tokenizer(answer=[answer_text_lower]).input_ids,
+            uncased_tokenizer(answer=[answer_text]).input_ids,
+        )
 
         # test input encoding lowercase
         question = "Greece held its last Summer Olympics in 2004"
         table_dict = {
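After this series, TAPEX dispatches on its keyword arguments rather than an internal tokenizer mode. A sketch, assuming microsoft/tapex-base and an installed pandas:

# A `table` selects source encoding; an `answer` alone selects target encoding.
import pandas as pd
from transformers import TapexTokenizer

tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
table = pd.DataFrame.from_dict({"city": ["beijing", "london"], "year": ["2008", "2012"]})
source = tokenizer(table=table, query="where were the olympics held in 2008?")
target = tokenizer(answer="beijing")  # no table given: encoded as a supervision target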
From b2a8e18fd7d9618427bf817b06fdfd68129a326c Mon Sep 17 00:00:00 2001
From: Sylvain Gugger
Date: Thu, 28 Jul 2022 13:05:19 -0400
Subject: [PATCH 07/10] Fix tests

---
 docs/source/en/model_doc/marian.mdx         |  1 +
 .../models/tapex/tokenization_tapex.py      | 25 +++----------
 src/transformers/tokenization_utils_base.py | 19 +++++++-------
 tests/models/dpr/test_tokenization_dpr.py   |  2 ++
 tests/models/nllb/test_tokenization_nllb.py |  2 +-
 5 files changed, 18 insertions(+), 31 deletions(-)

diff --git a/docs/source/en/model_doc/marian.mdx b/docs/source/en/model_doc/marian.mdx
index b415e9413c40..9d0a9ff2576a 100644
--- a/docs/source/en/model_doc/marian.mdx
+++ b/docs/source/en/model_doc/marian.mdx
@@ -155,6 +155,7 @@ Example of translating english to many romance languages, using old-style 2 char
 ## MarianTokenizer
 
 [[autodoc]] MarianTokenizer
+    - build_inputs_with_special_tokens
 
 ## MarianModel
diff --git a/src/transformers/models/tapex/tokenization_tapex.py b/src/transformers/models/tapex/tokenization_tapex.py
index d9afd160d38d..7c0725ffe7c1 100644
--- a/src/transformers/models/tapex/tokenization_tapex.py
+++ b/src/transformers/models/tapex/tokenization_tapex.py
@@ -62,12 +62,6 @@ class TapexTruncationStrategy(ExplicitEnum):
     DROP_ROWS_TO_FIT = "drop_rows_to_fit"
 
 
-class TokenizerStrategy(ExplicitEnum):
-
-    TOKENIZE_SOURCE = "tokenize_source"
-    TOKENIZE_TARGET = "tokenize_target"
-
-
 TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
     add_special_tokens (`bool`, *optional*, defaults to `True`):
         Whether or not to encode the sequences with the special tokens relative to their model.
@@ -340,9 +334,6 @@ def __init__(
         self.max_cell_length = max_cell_length
         self.table_linearize = IndexedRowTableLinearize()
 
-        # property to decide using which call function
-        self.current_tokenizer = TokenizerStrategy.TOKENIZE_SOURCE
-
     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
@@ -554,9 +545,7 @@ def __call__(
             Optionally, the corresponding answer to the questions as supervision.
         """
 
-        if self.current_tokenizer == TokenizerStrategy.TOKENIZE_SOURCE:
-            if table is None:
-                raise ValueError("Please ensure that the table is not empty if you use TAPEX to encode source.")
+        if table is not None:
             return self.source_call_func(
                 table=table,
                 query=query,
@@ -577,9 +566,7 @@ def __call__(
                 verbose=verbose,
                 **kwargs,
             )
-        else:
-            if answer is None:
-                raise ValueError("Please ensure that the answer is not empty if you use TAPEX to encode target.")
+        elif answer is not None:
             return self.target_call_func(
                 answer=answer,
                 add_special_tokens=add_special_tokens,
@@ -598,6 +585,8 @@ def __call__(
                 verbose=verbose,
                 **kwargs,
             )
+        else:
+            raise ValueError("You need to provide either a `table` or an `answer`.")
 
     def source_call_func(
         self,
@@ -1329,12 +1318,6 @@ def _target_encode_plus(
             verbose=verbose,
         )
 
-    def _switch_to_input_mode(self):
-        self.current_tokenizer = TokenizerStrategy.TOKENIZE_SOURCE
-
-    def _switch_to_target_mode(self):
-        self.current_tokenizer = TokenizerStrategy.TOKENIZE_TARGET
-
     def prepare_table_query(
         self,
         table,
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index a945b1abe58f..7e259fce9036 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -3694,15 +3694,16 @@ def prepare_seq2seq_batch(
         # Process tgt_texts
         if max_target_length is None:
             max_target_length = max_length
-        labels = self(
-            text_target=tgt_texts,
-            add_special_tokens=True,
-            return_tensors=return_tensors,
-            padding=padding,
-            max_length=max_target_length,
-            truncation=truncation,
-            **kwargs,
-        )
+        with self.as_target_tokenizer():
+            labels = self(
+                tgt_texts,
+                add_special_tokens=True,
+                return_tensors=return_tensors,
+                padding=padding,
+                max_length=max_target_length,
+                truncation=truncation,
+                **kwargs,
+            )
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
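Under the defaults used in the base class, the deprecated helper should keep returning the same fields as a plain call with `text_target`; that equivalence is this sketch's assumption, not an assertion from the patch (the Helsinki-NLP/opus-mt-en-de checkpoint is likewise assumed):

# Both paths produce input_ids, attention_mask and labels.
from transformers import MarianTokenizer

tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
old = tokenizer.prepare_seq2seq_batch(
    src_texts=["I am a small frog"], tgt_texts=["Ich bin ein kleiner Frosch."], return_tensors="pt"
)
new = tokenizer(
    ["I am a small frog"], text_target=["Ich bin ein kleiner Frosch."], padding=True, return_tensors="pt"
)
assert old["labels"].tolist() == new["labels"].tolist()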
diff --git a/tests/models/dpr/test_tokenization_dpr.py b/tests/models/dpr/test_tokenization_dpr.py
index 2870e0bcf352..1a085d86234d 100644
--- a/tests/models/dpr/test_tokenization_dpr.py
+++ b/tests/models/dpr/test_tokenization_dpr.py
@@ -14,6 +14,8 @@
 # limitations under the License.
 
+import unittest
+
 from transformers import (
     DPRContextEncoderTokenizer,
     DPRContextEncoderTokenizerFast,
diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py
index 88feb0ae533d..d77b101fa766 100644
--- a/tests/models/nllb/test_tokenization_nllb.py
+++ b/tests/models/nllb/test_tokenization_nllb.py
@@ -374,7 +374,7 @@ def test_special_tokens_unaffacted_by_save_load(self):
     def test_enro_tokenizer_prepare_batch(self):
         batch = self.tokenizer(
             self.src_text,
-            target_text=self.tgt_text,
+            text_target=self.tgt_text,
             padding=True,
             truncation=True,
             max_length=len(self.expected_src_tokens),

From 1cabae4f30bf53c14e48e2e4a165ed3e60de3977 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger
Date: Thu, 28 Jul 2022 13:10:25 -0400
Subject: [PATCH 08/10] Fix quality

---
 tests/models/dpr/test_tokenization_dpr.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/models/dpr/test_tokenization_dpr.py b/tests/models/dpr/test_tokenization_dpr.py
index 1a085d86234d..8ad2fea09c8b 100644
--- a/tests/models/dpr/test_tokenization_dpr.py
+++ b/tests/models/dpr/test_tokenization_dpr.py
@@ -13,9 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-import unittest
-
 from transformers import (
     DPRContextEncoderTokenizer,
     DPRContextEncoderTokenizerFast,

From 13251259ca61a86cee7a86906311e11896c77310 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Fri, 29 Jul 2022 07:30:13 -0400
Subject: [PATCH 09/10] Update examples/flax/image-captioning/run_image_captioning_flax.py

Co-authored-by: amyeroberts

---
 examples/flax/image-captioning/run_image_captioning_flax.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/flax/image-captioning/run_image_captioning_flax.py b/examples/flax/image-captioning/run_image_captioning_flax.py
index e3ea20e85770..53556fdc806b 100644
--- a/examples/flax/image-captioning/run_image_captioning_flax.py
+++ b/examples/flax/image-captioning/run_image_captioning_flax.py
@@ -553,7 +553,7 @@ def tokenization_fn(examples, max_target_length):
 
         model_inputs = {}
         labels = tokenizer(
-            targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np"
+            text_target=targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np"
         )
         model_inputs["labels"] = labels["input_ids"]
         decoder_input_ids = shift_tokens_right_fn(

From d9394918d0dd2302828b3e6908c3f55faf0077f3 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger
Date: Fri, 29 Jul 2022 07:39:13 -0400
Subject: [PATCH 10/10] Style

---
 examples/flax/image-captioning/run_image_captioning_flax.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/flax/image-captioning/run_image_captioning_flax.py b/examples/flax/image-captioning/run_image_captioning_flax.py
index a0e8d19b0c2e..4552defb8efc 100644
--- a/examples/flax/image-captioning/run_image_captioning_flax.py
+++ b/examples/flax/image-captioning/run_image_captioning_flax.py
@@ -554,7 +554,11 @@ def tokenization_fn(examples, max_target_length):
 
         model_inputs = {}
         labels = tokenizer(
-            text_target=targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np"
+            text_target=targets,
+            max_length=max_target_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="np",
         )
         model_inputs["labels"] = labels["input_ids"]
         decoder_input_ids = shift_tokens_right_fn(