From a6697baaa2532ab21c694c4cbef6977ebb2fcb7c Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 17 Nov 2020 10:31:16 -0500 Subject: [PATCH 1/5] Tokenizers should be framework agnostic --- docs/source/model_doc/marian.rst | 4 ++-- docs/source/model_doc/mbart.rst | 4 ++-- docs/source/model_doc/pegasus.rst | 2 +- .../tuner007/pegasus_paraphrase/README.md | 2 +- model_cards/tuner007/pegasus_qa/README.md | 2 +- scripts/fsmt/fsmt-make-super-tiny-model.py | 2 +- scripts/fsmt/fsmt-make-tiny-model.py | 2 +- .../models/bart/tokenization_bart.py | 4 ++-- .../models/bart/tokenization_bart_fast.py | 4 ++-- .../models/fsmt/tokenization_fsmt.py | 2 +- .../models/marian/modeling_marian.py | 2 +- .../models/marian/tokenization_marian.py | 4 ++-- .../models/mbart/modeling_mbart.py | 2 +- .../models/mbart/tokenization_mbart.py | 4 ++-- .../models/mbart/tokenization_mbart_fast.py | 4 ++-- .../models/pegasus/modeling_pegasus.py | 2 +- .../models/pegasus/tokenization_pegasus.py | 2 +- .../pegasus/tokenization_pegasus_fast.py | 2 +- .../models/rag/tokenization_rag.py | 2 +- src/transformers/tokenization_utils.py | 2 +- src/transformers/tokenization_utils_base.py | 2 +- tests/test_modeling_marian.py | 20 ++++++++++++------- tests/test_modeling_mbart.py | 12 +++++++---- tests/test_tokenization_common.py | 2 +- tests/test_tokenization_mbart.py | 11 +++++----- tests/test_tokenization_pegasus.py | 4 +++- 26 files changed, 58 insertions(+), 47 deletions(-) diff --git a/docs/source/model_doc/marian.rst b/docs/source/model_doc/marian.rst index 58d9f4c0a41ada..da54488a75cd80 100644 --- a/docs/source/model_doc/marian.rst +++ b/docs/source/model_doc/marian.rst @@ -78,7 +78,7 @@ require 3 character language codes: tokenizer = MarianTokenizer.from_pretrained(model_name) print(tokenizer.supported_language_codes) model = MarianMTModel.from_pretrained(model_name) - translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text)) + translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt")) tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated] # ["c'est une phrase en anglais que nous voulons traduire en français", # 'Isto deve ir para o português.', @@ -150,7 +150,7 @@ Example of translating english to many romance languages, using old-style 2 char print(tokenizer.supported_language_codes) model = MarianMTModel.from_pretrained(model_name) - translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text)) + translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt")) tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated] # ["c'est une phrase en anglais que nous voulons traduire en français", 'Isto deve ir para o português.', 'Y esto al español'] diff --git a/docs/source/model_doc/mbart.rst b/docs/source/model_doc/mbart.rst index 9d3aab93fd3590..26b00cbd411c04 100644 --- a/docs/source/model_doc/mbart.rst +++ b/docs/source/model_doc/mbart.rst @@ -44,7 +44,7 @@ the sequences for sequence-to-sequence fine-tuning. example_english_phrase = "UN Chief Says There Is No Military Solution in Syria" expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria" - batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian) + batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt") model(input_ids=batch['input_ids'], labels=batch['labels']) # forward pass - Generation @@ -58,7 +58,7 @@ the sequences for sequence-to-sequence fine-tuning. model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro") tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro") article = "UN Chief Says There Is No Military Solution in Syria" - batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX") + batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX", return_tensors="pt") translated_tokens = model.generate(**batch, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"]) translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria" diff --git a/docs/source/model_doc/pegasus.rst b/docs/source/model_doc/pegasus.rst index a652ce0851ef4f..290266051e2731 100644 --- a/docs/source/model_doc/pegasus.rst +++ b/docs/source/model_doc/pegasus.rst @@ -78,7 +78,7 @@ Usage Example torch_device = 'cuda' if torch.cuda.is_available() else 'cpu' tokenizer = PegasusTokenizer.from_pretrained(model_name) model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device) - batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest').to(torch_device) + batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest', return_tensors="pt").to(torch_device) translated = model.generate(**batch) tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True) assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers." diff --git a/model_cards/tuner007/pegasus_paraphrase/README.md b/model_cards/tuner007/pegasus_paraphrase/README.md index f83d6d29e1d29b..311d8de61c3326 100644 --- a/model_cards/tuner007/pegasus_paraphrase/README.md +++ b/model_cards/tuner007/pegasus_paraphrase/README.md @@ -11,7 +11,7 @@ tokenizer = PegasusTokenizer.from_pretrained(model_name) model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device) def get_response(input_text,num_return_sequences): - batch = tokenizer.prepare_seq2seq_batch([input_text],truncation=True,padding='longest',max_length=60).to(torch_device) + batch = tokenizer.prepare_seq2seq_batch([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device) translated = model.generate(**batch,max_length=60,num_beams=10, num_return_sequences=num_return_sequences, temperature=1.5) tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True) return tgt_text diff --git a/model_cards/tuner007/pegasus_qa/README.md b/model_cards/tuner007/pegasus_qa/README.md index 3d3831bfc97c69..bc9397225d7b83 100644 --- a/model_cards/tuner007/pegasus_qa/README.md +++ b/model_cards/tuner007/pegasus_qa/README.md @@ -12,7 +12,7 @@ model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_dev def get_answer(question, context): input_text = "question: %s text: %s" % (question,context) - batch = tokenizer.prepare_seq2seq_batch([input_text], truncation=True, padding='longest').to(torch_device) + batch = tokenizer.prepare_seq2seq_batch([input_text], truncation=True, padding='longest', return_tensors="pt").to(torch_device) translated = model.generate(**batch) tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True) return tgt_text[0] diff --git a/scripts/fsmt/fsmt-make-super-tiny-model.py b/scripts/fsmt/fsmt-make-super-tiny-model.py index f1742a4dc4ffaf..b5ec17c65f4834 100755 --- a/scripts/fsmt/fsmt-make-super-tiny-model.py +++ b/scripts/fsmt/fsmt-make-super-tiny-model.py @@ -58,7 +58,7 @@ print(f"num of params {tiny_model.num_parameters()}") # Test -batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"]) +batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"], return_tensors="pt") outputs = tiny_model(**batch) print("test output:", len(outputs.logits[0])) diff --git a/scripts/fsmt/fsmt-make-tiny-model.py b/scripts/fsmt/fsmt-make-tiny-model.py index 924eb0b63c93d6..ba4786fba3b639 100755 --- a/scripts/fsmt/fsmt-make-tiny-model.py +++ b/scripts/fsmt/fsmt-make-tiny-model.py @@ -29,7 +29,7 @@ print(f"num of params {tiny_model.num_parameters()}") # Test -batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"]) +batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"], return_tensors="pt") outputs = tiny_model(**batch) print("test output:", len(outputs.logits[0])) diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py index 24046b39676a33..071492238ae5c0 100644 --- a/src/transformers/models/bart/tokenization_bart.py +++ b/src/transformers/models/bart/tokenization_bart.py @@ -61,7 +61,7 @@ def prepare_seq2seq_batch( max_length: Optional[int] = None, max_target_length: Optional[int] = None, padding: str = "longest", - return_tensors: str = "None", + return_tensors: str = None, truncation=True, **kwargs, ) -> BatchEncoding: @@ -91,7 +91,7 @@ def prepare_seq2seq_batch( maximum acceptable input length for the model if that argument is not provided. * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths). - return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"): + return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): If set, will return tensors instead of list of python integers. Acceptable values are: * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py index 69c16c2be4cfe8..43f226f3103217 100644 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ b/src/transformers/models/bart/tokenization_bart_fast.py @@ -56,7 +56,7 @@ def prepare_seq2seq_batch( max_length: Optional[int] = None, max_target_length: Optional[int] = None, padding: str = "longest", - return_tensors: str = "None", + return_tensors: str = None, truncation=True, **kwargs, ) -> BatchEncoding: @@ -86,7 +86,7 @@ def prepare_seq2seq_batch( maximum acceptable input length for the model if that argument is not provided. * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths). - return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"): + return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): If set, will return tensors instead of list of python integers. Acceptable values are: * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py index 083906bf3031e6..0c11a7a64db9a8 100644 --- a/src/transformers/models/fsmt/tokenization_fsmt.py +++ b/src/transformers/models/fsmt/tokenization_fsmt.py @@ -491,7 +491,7 @@ def prepare_seq2seq_batch( tgt_texts: Optional[List[str]] = None, max_length: Optional[int] = None, max_target_length: Optional[int] = None, - return_tensors: str = "pt", + return_tensors: str = None, truncation=True, padding="longest", **unused, diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index b02e4599fdc6bc..637529c1168414 100644 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -41,7 +41,7 @@ class MarianMTModel(BartForConditionalGeneration): >>> model = MarianMTModel.from_pretrained(mname) >>> tok = MarianTokenizer.from_pretrained(mname) - >>> batch = tok.prepare_seq2seq_batch(src_texts=[sample_text]) # don't need tgt_text for inference + >>> batch = tok.prepare_seq2seq_batch(src_texts=[sample_text], return_tensors="pt") # don't need tgt_text for inference >>> gen = model.generate(**batch) # for forward pass: model(**batch) >>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the bus stop ?" diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index 8af0c40f4f769b..cb81ddfe309aae 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -70,7 +70,7 @@ class MarianTokenizer(PreTrainedTokenizer): >>> tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de') >>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."] >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional - >>> batch_enc: BatchEncoding = tok.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts) + >>> batch_enc: BatchEncoding = tok.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, return_tensors="pt") >>> # keys [input_ids, attention_mask, labels]. >>> # model(**batch) should work """ @@ -175,7 +175,7 @@ def prepare_seq2seq_batch( tgt_texts: Optional[List[str]] = None, max_length: Optional[int] = None, max_target_length: Optional[int] = None, - return_tensors: str = "pt", + return_tensors: str = None, truncation=True, padding="longest", **unused, diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 7f92929125321a..2978a250dcb9f3 100644 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -22,7 +22,7 @@ class MBartForConditionalGeneration(BartForConditionalGeneration): >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro") >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro") >>> article = "UN Chief Says There Is No Military Solution in Syria" - >>> batch = tokenizer.prepare_seq2seq_batch(src_texts=[article]) + >>> batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], return_tensors="pt") >>> translated_tokens = model.generate(**batch) >>> translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] >>> assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria" diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py index bb5f604d6b4c5a..98448fe66168ad 100644 --- a/src/transformers/models/mbart/tokenization_mbart.py +++ b/src/transformers/models/mbart/tokenization_mbart.py @@ -81,7 +81,7 @@ class MBartTokenizer(XLMRobertaTokenizer): >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria" >>> batch: dict = tokenizer.prepare_seq2seq_batch( - ... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian + ... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt" ... ) """ @@ -183,7 +183,7 @@ def prepare_seq2seq_batch( max_target_length: Optional[int] = None, truncation: bool = True, padding: str = "longest", - return_tensors: str = "pt", + return_tensors: str = None, add_prefix_space: bool = False, # ignored **kwargs, ) -> BatchEncoding: diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py index 27243c55afa5f9..14b6e4919b7962 100644 --- a/src/transformers/models/mbart/tokenization_mbart_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart_fast.py @@ -89,7 +89,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast): >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria" >>> batch: dict = tokenizer.prepare_seq2seq_batch( - ... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian + ... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt" ... ) """ @@ -181,7 +181,7 @@ def prepare_seq2seq_batch( max_target_length: Optional[int] = None, truncation: bool = True, padding: str = "longest", - return_tensors: str = "pt", + return_tensors: str = None, **kwargs, ) -> BatchEncoding: if max_length is None: diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 3d721d5ae883b0..64515c7a8ba733 100644 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -38,7 +38,7 @@ class PegasusForConditionalGeneration(BartForConditionalGeneration): >>> model = PegasusForConditionalGeneration.from_pretrained(mname) >>> tok = PegasusTokenizer.from_pretrained(mname) - >>> batch = tok.prepare_seq2seq_batch(src_texts=[PGE_ARTICLE]) # don't need tgt_text for inference + >>> batch = tok.prepare_seq2seq_batch(src_texts=[PGE_ARTICLE], return_tensors="pt") # don't need tgt_text for inference >>> gen = model.generate(**batch) # for forward pass: model(**batch) >>> summary: List[str] = tok.batch_decode(gen, skip_special_tokens=True) >>> assert summary == "California's largest electricity provider has turned off power to tens of thousands of customers." diff --git a/src/transformers/models/pegasus/tokenization_pegasus.py b/src/transformers/models/pegasus/tokenization_pegasus.py index 170eb37e5f9ff7..5728338276d26c 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus.py +++ b/src/transformers/models/pegasus/tokenization_pegasus.py @@ -134,7 +134,7 @@ def prepare_seq2seq_batch( tgt_texts: Optional[List[str]] = None, max_length: Optional[int] = None, max_target_length: Optional[int] = None, - return_tensors: str = "pt", + return_tensors: str = None, truncation=True, padding="longest", **unused, diff --git a/src/transformers/models/pegasus/tokenization_pegasus_fast.py b/src/transformers/models/pegasus/tokenization_pegasus_fast.py index 30fb45e0be96ef..e221eb4b54b018 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus_fast.py +++ b/src/transformers/models/pegasus/tokenization_pegasus_fast.py @@ -95,7 +95,7 @@ def prepare_seq2seq_batch( tgt_texts: Optional[List[str]] = None, max_length: Optional[int] = None, max_target_length: Optional[int] = None, - return_tensors: str = "pt", + return_tensors: str = None, truncation=True, padding="longest", **unused, diff --git a/src/transformers/models/rag/tokenization_rag.py b/src/transformers/models/rag/tokenization_rag.py index c3deffc98654f1..766d04662d71cd 100644 --- a/src/transformers/models/rag/tokenization_rag.py +++ b/src/transformers/models/rag/tokenization_rag.py @@ -71,7 +71,7 @@ def prepare_seq2seq_batch( max_length: Optional[int] = None, max_target_length: Optional[int] = None, padding: str = "longest", - return_tensors: str = "np", + return_tensors: str = None, truncation=True, **kwargs, ) -> BatchEncoding: diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 0f6f8b473bffc0..c9f63eba3bbd7c 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -797,7 +797,7 @@ def prepare_seq2seq_batch( maximum acceptable input length for the model if that argument is not provided. * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths). - return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"): + return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): If set, will return tensors instead of list of python integers. Acceptable values are: * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 1f492b06e92053..25bd051f827ab6 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1455,7 +1455,7 @@ def all_special_ids(self) -> List[int]: maximum acceptable input length for the model if that argument is not provided. * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths). - return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"): + return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): If set, will return tensors instead of list of python integers. Acceptable values are: * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. diff --git a/tests/test_modeling_marian.py b/tests/test_modeling_marian.py index cdbb92f5047cac..dc50daa9a78406 100644 --- a/tests/test_modeling_marian.py +++ b/tests/test_modeling_marian.py @@ -132,9 +132,9 @@ def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs): self.assertListEqual(self.expected_text, generated_words) def translate_src_text(self, **tokenizer_kwargs): - model_inputs = self.tokenizer.prepare_seq2seq_batch(src_texts=self.src_text, **tokenizer_kwargs).to( - torch_device - ) + model_inputs = self.tokenizer.prepare_seq2seq_batch( + src_texts=self.src_text, return_tensors="pt", **tokenizer_kwargs + ).to(torch_device) self.assertEqual(self.model.device, model_inputs.input_ids.device) generated_ids = self.model.generate( model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2, max_length=128 @@ -151,7 +151,9 @@ def test_forward(self): src, tgt = ["I am a small frog"], ["Ich bin ein kleiner Frosch."] expected_ids = [38, 121, 14, 697, 38848, 0] - model_inputs: dict = self.tokenizer.prepare_seq2seq_batch(src, tgt_texts=tgt).to(torch_device) + model_inputs: dict = self.tokenizer.prepare_seq2seq_batch(src, tgt_texts=tgt, return_tensors="pt").to( + torch_device + ) self.assertListEqual(expected_ids, model_inputs.input_ids[0].tolist()) @@ -171,12 +173,16 @@ def test_forward(self): def test_unk_support(self): t = self.tokenizer - ids = t.prepare_seq2seq_batch(["||"]).to(torch_device).input_ids[0].tolist() + ids = t.prepare_seq2seq_batch(["||"], return_tensors="pt").to(torch_device).input_ids[0].tolist() expected = [t.unk_token_id, t.unk_token_id, t.eos_token_id] self.assertEqual(expected, ids) def test_pad_not_split(self): - input_ids_w_pad = self.tokenizer.prepare_seq2seq_batch(["I am a small frog "]).input_ids[0].tolist() + input_ids_w_pad = ( + self.tokenizer.prepare_seq2seq_batch(["I am a small frog "], return_tensors="pt") + .input_ids[0] + .tolist() + ) expected_w_pad = [38, 121, 14, 697, 38848, self.tokenizer.pad_token_id, 0] # pad self.assertListEqual(expected_w_pad, input_ids_w_pad) @@ -294,7 +300,7 @@ def test_tokenizer_handles_empty(self): normalized = self.tokenizer.normalize("") self.assertIsInstance(normalized, str) with self.assertRaises(ValueError): - self.tokenizer.prepare_seq2seq_batch([""]) + self.tokenizer.prepare_seq2seq_batch([""], return_tensors="pt") @slow def test_pipeline(self): diff --git a/tests/test_modeling_mbart.py b/tests/test_modeling_mbart.py index 2b8da23338a532..8bb874613e9d64 100644 --- a/tests/test_modeling_mbart.py +++ b/tests/test_modeling_mbart.py @@ -92,7 +92,7 @@ class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest): @slow def test_enro_generate_one(self): batch: BatchEncoding = self.tokenizer.prepare_seq2seq_batch( - ["UN Chief Says There Is No Military Solution in Syria"] + ["UN Chief Says There Is No Military Solution in Syria"], return_tensors="pt" ).to(torch_device) translated_tokens = self.model.generate(**batch) decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) @@ -101,7 +101,9 @@ def test_enro_generate_one(self): @slow def test_enro_generate_batch(self): - batch: BatchEncoding = self.tokenizer.prepare_seq2seq_batch(self.src_text).to(torch_device) + batch: BatchEncoding = self.tokenizer.prepare_seq2seq_batch(self.src_text, return_tensors="pt").to( + torch_device + ) translated_tokens = self.model.generate(**batch) decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) assert self.tgt_text == decoded @@ -153,7 +155,7 @@ class MBartCC25IntegrationTest(AbstractSeq2SeqIntegrationTest): @unittest.skip("This test is broken, still generates english") def test_cc25_generate(self): - inputs = self.tokenizer.prepare_seq2seq_batch([self.src_text[0]]).to(torch_device) + inputs = self.tokenizer.prepare_seq2seq_batch([self.src_text[0]], return_tensors="pt").to(torch_device) translated_tokens = self.model.generate( input_ids=inputs["input_ids"].to(torch_device), decoder_start_token_id=self.tokenizer.lang_code_to_id["ro_RO"], @@ -163,7 +165,9 @@ def test_cc25_generate(self): @slow def test_fill_mask(self): - inputs = self.tokenizer.prepare_seq2seq_batch(["One of the best I ever read!"]).to(torch_device) + inputs = self.tokenizer.prepare_seq2seq_batch(["One of the best I ever read!"], return_tensors="pt").to( + torch_device + ) outputs = self.model.generate( inputs["input_ids"], decoder_start_token_id=self.tokenizer.lang_code_to_id["en_XX"], num_beams=1 ) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 376616a0b5deaa..1bfd54c3fed885 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1794,7 +1794,7 @@ def test_prepare_seq2seq_batch(self): self.assertEqual(batch.input_ids.shape[1], 3) self.assertEqual(batch.labels.shape[1], 10) # max_target_length will default to max_length if not specified - batch = tokenizer.prepare_seq2seq_batch(src_text, tgt_texts=tgt_text, max_length=3) + batch = tokenizer.prepare_seq2seq_batch(src_text, tgt_texts=tgt_text, max_length=3, return_tensors="pt") self.assertEqual(batch.input_ids.shape[1], 3) self.assertEqual(batch.labels.shape[1], 3) diff --git a/tests/test_tokenization_mbart.py b/tests/test_tokenization_mbart.py index dd8d6e3f4fb344..f41925e0b91de3 100644 --- a/tests/test_tokenization_mbart.py +++ b/tests/test_tokenization_mbart.py @@ -165,7 +165,6 @@ def test_enro_tokenizer_truncation(self): desired_max_length = 10 ids = self.tokenizer.prepare_seq2seq_batch( src_text, - return_tensors=None, max_length=desired_max_length, ).input_ids[0] self.assertEqual(ids[-2], 2) @@ -203,9 +202,7 @@ def test_batch_fairseq_parity(self): @require_torch def test_enro_tokenizer_prepare_seq2seq_batch(self): batch = self.tokenizer.prepare_seq2seq_batch( - self.src_text, - tgt_texts=self.tgt_text, - max_length=len(self.expected_src_tokens), + self.src_text, tgt_texts=self.tgt_text, max_length=len(self.expected_src_tokens), return_tensors="pt" ) batch["decoder_input_ids"] = shift_tokens_right(batch.labels, self.tokenizer.pad_token_id) self.assertIsInstance(batch, BatchEncoding) @@ -221,13 +218,15 @@ def test_enro_tokenizer_prepare_seq2seq_batch(self): def test_seq2seq_max_target_length(self): batch = self.tokenizer.prepare_seq2seq_batch( - self.src_text, tgt_texts=self.tgt_text, max_length=3, max_target_length=10 + self.src_text, tgt_texts=self.tgt_text, max_length=3, max_target_length=10, return_tensors="pt" ) batch["decoder_input_ids"] = shift_tokens_right(batch.labels, self.tokenizer.pad_token_id) self.assertEqual(batch.input_ids.shape[1], 3) self.assertEqual(batch.decoder_input_ids.shape[1], 10) # max_target_length will default to max_length if not specified - batch = self.tokenizer.prepare_seq2seq_batch(self.src_text, tgt_texts=self.tgt_text, max_length=3) + batch = self.tokenizer.prepare_seq2seq_batch( + self.src_text, tgt_texts=self.tgt_text, max_length=3, return_tensors="pt" + ) batch["decoder_input_ids"] = shift_tokens_right(batch.labels, self.tokenizer.pad_token_id) self.assertEqual(batch.input_ids.shape[1], 3) self.assertEqual(batch.decoder_input_ids.shape[1], 3) diff --git a/tests/test_tokenization_pegasus.py b/tests/test_tokenization_pegasus.py index 6536220c32cabc..ad26075da69f67 100644 --- a/tests/test_tokenization_pegasus.py +++ b/tests/test_tokenization_pegasus.py @@ -61,7 +61,9 @@ def test_pegasus_large_tokenizer_settings(self): def test_pegasus_large_seq2seq_truncation(self): src_texts = ["This is going to be way too long." * 150, "short example"] tgt_texts = ["not super long but more than 5 tokens", "tiny"] - batch = self.pegasus_large_tokenizer.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, max_target_length=5) + batch = self.pegasus_large_tokenizer.prepare_seq2seq_batch( + src_texts, tgt_texts=tgt_texts, max_target_length=5, return_tensors="pt" + ) assert batch.input_ids.shape == (2, 1024) assert batch.attention_mask.shape == (2, 1024) assert "labels" in batch # because tgt_texts was specified From 82a62ea3fd69700950ee2ebbabce8ea1c3c9fecd Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 17 Nov 2020 12:07:23 -0500 Subject: [PATCH 2/5] Run the slow tests --- .github/workflows/self-push.yml | 8 ++++---- .github/workflows/self-scheduled.yml | 9 +++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index 4a0611fc5a2542..286dd231eff51f 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -16,7 +16,7 @@ on: jobs: run_tests_torch_gpu: - runs-on: [self-hosted, single-gpu] + runs-on: [self-hosted, gpu-tests, single-gpu] steps: - uses: actions/checkout@v2 - name: Python version @@ -86,7 +86,7 @@ jobs: run_tests_tf_gpu: - runs-on: [self-hosted, single-gpu] + runs-on: [self-hosted, gpu-tests, single-gpu] steps: - uses: actions/checkout@v2 - name: Python version @@ -154,7 +154,7 @@ jobs: path: reports run_tests_torch_multi_gpu: - runs-on: [self-hosted, multi-gpu] + runs-on: [self-hosted, gpu-tests, multi-gpu] steps: - uses: actions/checkout@v2 - name: Python version @@ -213,7 +213,7 @@ jobs: path: reports run_tests_tf_multi_gpu: - runs-on: [self-hosted, multi-gpu] + runs-on: [self-hosted, gpu-tests, multi-gpu] steps: - uses: actions/checkout@v2 - name: Python version diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 54c126f39f8283..1daf08898958c1 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -9,13 +9,14 @@ on: push: branches: - ci_* + - framework-agnostic-tokenizers repository_dispatch: schedule: - cron: "0 0 * * *" jobs: run_all_tests_torch_gpu: - runs-on: [self-hosted, single-gpu] + runs-on: [self-hosted, gpu-tests, single-gpu] steps: - uses: actions/checkout@v2 @@ -109,7 +110,7 @@ jobs: run_all_tests_tf_gpu: - runs-on: [self-hosted, single-gpu] + runs-on: [self-hosted, gpu-tests, single-gpu] steps: - uses: actions/checkout@v2 @@ -188,7 +189,7 @@ jobs: path: reports run_all_tests_torch_multi_gpu: - runs-on: [self-hosted, multi-gpu] + runs-on: [self-hosted, gpu-tests, multi-gpu] steps: - uses: actions/checkout@v2 @@ -279,7 +280,7 @@ jobs: path: reports run_all_tests_tf_multi_gpu: - runs-on: [self-hosted, multi-gpu] + runs-on: [self-hosted, gpu-tests, multi-gpu] steps: - uses: actions/checkout@v2 From e2d9526be880c34c745e1397368aade9746d5796 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 17 Nov 2020 12:12:43 -0500 Subject: [PATCH 3/5] Not testing --- .github/workflows/self-push.yml | 8 ++++---- .github/workflows/self-scheduled.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index 286dd231eff51f..0957f2f865cc75 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -16,7 +16,7 @@ on: jobs: run_tests_torch_gpu: - runs-on: [self-hosted, gpu-tests, single-gpu] + runs-on: [self-hosted, gpu, single-gpu] steps: - uses: actions/checkout@v2 - name: Python version @@ -86,7 +86,7 @@ jobs: run_tests_tf_gpu: - runs-on: [self-hosted, gpu-tests, single-gpu] + runs-on: [self-hosted, gpu, single-gpu] steps: - uses: actions/checkout@v2 - name: Python version @@ -154,7 +154,7 @@ jobs: path: reports run_tests_torch_multi_gpu: - runs-on: [self-hosted, gpu-tests, multi-gpu] + runs-on: [self-hosted, gpu, multi-gpu] steps: - uses: actions/checkout@v2 - name: Python version @@ -213,7 +213,7 @@ jobs: path: reports run_tests_tf_multi_gpu: - runs-on: [self-hosted, gpu-tests, multi-gpu] + runs-on: [self-hosted, gpu, multi-gpu] steps: - uses: actions/checkout@v2 - name: Python version diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 1daf08898958c1..592733b5ba607d 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -16,7 +16,7 @@ on: jobs: run_all_tests_torch_gpu: - runs-on: [self-hosted, gpu-tests, single-gpu] + runs-on: [self-hosted, gpu, single-gpu] steps: - uses: actions/checkout@v2 @@ -110,7 +110,7 @@ jobs: run_all_tests_tf_gpu: - runs-on: [self-hosted, gpu-tests, single-gpu] + runs-on: [self-hosted, gpu, single-gpu] steps: - uses: actions/checkout@v2 @@ -189,7 +189,7 @@ jobs: path: reports run_all_tests_torch_multi_gpu: - runs-on: [self-hosted, gpu-tests, multi-gpu] + runs-on: [self-hosted, gpu, multi-gpu] steps: - uses: actions/checkout@v2 @@ -280,7 +280,7 @@ jobs: path: reports run_all_tests_tf_multi_gpu: - runs-on: [self-hosted, gpu-tests, multi-gpu] + runs-on: [self-hosted, gpu, multi-gpu] steps: - uses: actions/checkout@v2 From 996775386d0134977dd520202d6f473a5c451e7d Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 17 Nov 2020 12:18:34 -0500 Subject: [PATCH 4/5] Fix documentation --- .../models/bart/tokenization_bart.py | 65 ++---------------- .../models/bart/tokenization_bart_fast.py | 67 ++----------------- 2 files changed, 8 insertions(+), 124 deletions(-) diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py index 071492238ae5c0..6b46e30e9d527c 100644 --- a/src/transformers/models/bart/tokenization_bart.py +++ b/src/transformers/models/bart/tokenization_bart.py @@ -15,7 +15,9 @@ from typing import List, Optional -from ...tokenization_utils_base import BatchEncoding +from transformers import add_start_docstrings + +from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding from ...utils import logging from ..roberta.tokenization_roberta import RobertaTokenizer @@ -54,6 +56,7 @@ class BartTokenizer(RobertaTokenizer): "merges_file": {m: merges_url for m in _all_bart_models}, } + @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) def prepare_seq2seq_batch( self, src_texts: List[str], @@ -65,66 +68,6 @@ def prepare_seq2seq_batch( truncation=True, **kwargs, ) -> BatchEncoding: - r""" - - Prepare a batch that can be passed directly to an instance of :class:`~transformers.BartModel`. - - Args: - src_texts: (:obj:`List[str]`): - List of documents to summarize or source language texts. - tgt_texts: (:obj:`List[str]`, `optional`): - List of summaries or target language texts. - max_length (:obj:`int`, `optional`): - Controls the maximum length for encoder inputs (documents to summarize or source language texts). If - left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length - is required by one of the truncation/padding parameters. If the model has no specific maximum input - length (like XLNet) truncation/padding to a maximum length will be deactivated. - max_target_length (:obj:`int`, `optional`): - Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or - set to :obj:`None`, this will use the max_length value. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): - Activates and controls padding. Accepts the following values: - - * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a - single sequence if provided). - * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the - maximum acceptable input length for the model if that argument is not provided. - * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of - different lengths). - return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): - If set, will return tensors instead of list of python integers. Acceptable values are: - - * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. - * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. - * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. - truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`): - Activates and controls truncation. Accepts the following values: - - * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument - :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not - provided. This will truncate token by token, removing a token from the longest sequence in the pair - if a pair of sequences (or a batch of pairs) is provided. - * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to - the maximum acceptable input length for the model if that argument is not provided. This will only - truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or - to the maximum acceptable input length for the model if that argument is not provided. This will only - truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with - sequence lengths greater than the model maximum admissible input size). - **kwargs: - Additional keyword arguments passed along to :obj:`self.__call__`. - - Returns: - :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields: - - - **input_ids** -- List of token ids to be fed to the encoder. - - **attention_mask** -- List of indices specifying which tokens should be attended to by the model. - - **labels** -- List of token ids for tgt_texts - - The full set of keys ``[input_ids, attention_mask, labels]``, will only be returned if tgt_texts is passed. - Otherwise, input_ids, attention_mask will be the only keys. - """ kwargs.pop("src_lang", None) kwargs.pop("tgt_lang", None) if max_length is None: diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py index 43f226f3103217..21a70e51ece9dc 100644 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ b/src/transformers/models/bart/tokenization_bart_fast.py @@ -15,7 +15,9 @@ from typing import List, Optional -from ...tokenization_utils_base import BatchEncoding +from transformers import add_start_docstrings + +from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding from ...utils import logging from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast from .tokenization_bart import BartTokenizer @@ -49,6 +51,7 @@ class BartTokenizerFast(RobertaTokenizerFast): } slow_tokenizer_class = BartTokenizer + @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) def prepare_seq2seq_batch( self, src_texts: List[str], @@ -60,68 +63,6 @@ def prepare_seq2seq_batch( truncation=True, **kwargs, ) -> BatchEncoding: - r""" - - Prepare a batch that can be passed directly to an instance of :class:`~transformers.BartModel`. - - Args: - src_texts: (:obj:`List[str]`): - List of documents to summarize or source language texts. - tgt_texts: (:obj:`List[str]`, `optional`): - List of summaries or target language texts. - max_length (:obj:`int`, `optional`): - Controls the maximum length for encoder inputs (documents to summarize or source language texts). If - left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length - is required by one of the truncation/padding parameters. If the model has no specific maximum input - length (like XLNet) truncation/padding to a maximum length will be deactivated. - max_target_length (:obj:`int`, `optional`): - Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or - set to :obj:`None`, this will use the max_length value. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): - Activates and controls padding. Accepts the following values: - - * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a - single sequence if provided). - * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the - maximum acceptable input length for the model if that argument is not provided. - * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of - different lengths). - return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): - If set, will return tensors instead of list of python integers. Acceptable values are: - - * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. - * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. - * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. - truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`): - Activates and controls truncation. Accepts the following values: - - * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument - :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not - provided. This will truncate token by token, removing a token from the longest sequence in the pair - if a pair of sequences (or a batch of pairs) is provided. - * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to - the maximum acceptable input length for the model if that argument is not provided. This will only - truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or - to the maximum acceptable input length for the model if that argument is not provided. This will only - truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with - sequence lengths greater than the model maximum admissible input size). - **kwargs: - Additional keyword arguments passed along to :obj:`self.__call__`. - - Returns: - :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields: - - - **input_ids** -- List of token ids to be fed to the encoder. - - **attention_mask** -- List of indices specifying which tokens should be attended to by the model. - - **decoder_input_ids** -- List of token ids to be fed to the decoder. - - **decoder_attention_mask** -- List of indices specifying which tokens should be attended to by the - decoder. This does not include causal mask, which is built by the model. - - The full set of keys ``[input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]``, will only - be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys. - """ if max_length is None: max_length = self.model_max_length model_inputs: BatchEncoding = self( From 90555c2c8f4f5cade6d19b431443a91a31b83ea5 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Tue, 17 Nov 2020 14:02:40 -0500 Subject: [PATCH 5/5] Apply suggestions from code review Co-authored-by: Patrick von Platen --- src/transformers/models/bart/tokenization_bart_fast.py | 2 +- src/transformers/models/fsmt/tokenization_fsmt.py | 2 +- src/transformers/models/marian/tokenization_marian.py | 2 +- src/transformers/models/mbart/tokenization_mbart.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py index 21a70e51ece9dc..30b77275f22169 100644 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ b/src/transformers/models/bart/tokenization_bart_fast.py @@ -59,7 +59,7 @@ def prepare_seq2seq_batch( max_length: Optional[int] = None, max_target_length: Optional[int] = None, padding: str = "longest", - return_tensors: str = None, + return_tensors: Optional[str] = None, truncation=True, **kwargs, ) -> BatchEncoding: diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py index 0c11a7a64db9a8..71bfd93000f8ce 100644 --- a/src/transformers/models/fsmt/tokenization_fsmt.py +++ b/src/transformers/models/fsmt/tokenization_fsmt.py @@ -491,7 +491,7 @@ def prepare_seq2seq_batch( tgt_texts: Optional[List[str]] = None, max_length: Optional[int] = None, max_target_length: Optional[int] = None, - return_tensors: str = None, + return_tensors: Optional[str] = None, truncation=True, padding="longest", **unused, diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index cb81ddfe309aae..67b289db1fd84d 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -175,7 +175,7 @@ def prepare_seq2seq_batch( tgt_texts: Optional[List[str]] = None, max_length: Optional[int] = None, max_target_length: Optional[int] = None, - return_tensors: str = None, + return_tensors: Optional[str] = None, truncation=True, padding="longest", **unused, diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py index 98448fe66168ad..468d218ed37cbd 100644 --- a/src/transformers/models/mbart/tokenization_mbart.py +++ b/src/transformers/models/mbart/tokenization_mbart.py @@ -183,7 +183,7 @@ def prepare_seq2seq_batch( max_target_length: Optional[int] = None, truncation: bool = True, padding: str = "longest", - return_tensors: str = None, + return_tensors: Optional[str] = None, add_prefix_space: bool = False, # ignored **kwargs, ) -> BatchEncoding: