From 4448ce6cde2b8387a037031ff7cd5ac4da02562b Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Mon, 3 Aug 2020 18:07:51 +0530 Subject: [PATCH 01/10] add prepare s2s batch --- src/transformers/tokenization_bart.py | 59 ++++++++++++++++++++ tests/test_modeling_bart.py | 77 ++++++++++++++++++++++++++- 2 files changed, 135 insertions(+), 1 deletion(-) diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py index 499895e0bda666..aac75ca526386a 100644 --- a/src/transformers/tokenization_bart.py +++ b/src/transformers/tokenization_bart.py @@ -45,6 +45,65 @@ class BartTokenizer(RobertaTokenizer): "merges_file": {m: merges_url for m in _all_bart_models}, } + def prepare_seq2seq_batch( + self, + src_texts: List[str], + tgt_texts: Optional[List[str]] = None, + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + padding: str = "longest", + return_tensors: str = "None", + **kwargs, + ) -> BatchEncoding: + """Prepare a batch that can be passed directly to an instance of BartModel. + Args: + src_texts (:obj:`List[str]`): + list of src texts + tgt_texts (:obj:`List[str]`, `optional`): + list of tgt texts + max_length (:obj:`int`, `optional`): + maximum length for the source text which defers to the config value of 1024 for facebook/bart* + max_target_length (:obj:`int`, `optional`): + maximum length for the target text which defers to the config value of 1024 for facebook/bart* + padding (:obj:`str`, `optional`, defaults to "longest"): + strategy for padding `input_ids` and `decoder_input_ids`. Should be "max_length" or "longest". + return_tensors (:obj:`str`, `optional`): + Can be set to ‘tf’, ‘pt’ or ‘np’ to return respectively TensorFlow `tf.constant`, PyTorch `torch.Tensor` or Numpy :oj: np.ndarray instead of a list of python integers. + **kwargs: + passed to self.__call__ + Returns: + :class:`~transformers.BatchEncoding`: with keys input_ids, attention_mask, decoder_input_ids, decoder_attention_mask. 
+ """ + if max_length is None: + max_length = self.model_max_length + model_inputs: BatchEncoding = self( + src_texts, + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + padding=padding, + truncation=True, + **kwargs, + ) + if tgt_texts is None: + return model_inputs + # Process tgt_texts + if max_target_length is None: + max_target_length = max_length + decoder_inputs: BatchEncoding = self( + tgt_texts, + add_special_tokens=True, + return_tensors=return_tensors, + padding=padding, + max_length=max_target_length, + truncation=True, + **kwargs, + ) + for k, v in decoder_inputs.items(): + model_inputs[f"decoder_{k}"] = v + + return model_inputs + class BartTokenizerFast(RobertaTokenizerFast): # merges and vocab same as Roberta diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py index e86e46812e2ecd..d28e5fc3bc820c 100644 --- a/tests/test_modeling_bart.py +++ b/tests/test_modeling_bart.py @@ -18,7 +18,8 @@ import timeout_decorator # noqa -from transformers import is_torch_available +from transformers import BatchEncoding, is_torch_available +from transformers.file_utils import cached_property from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester @@ -415,6 +416,10 @@ def _long_tensor(tok_lst): @require_torch class BartModelIntegrationTests(unittest.TestCase): + @cached_property + def default_tokenizer(self): + return BartTokenizer.from_pretrained("facebook/bart-large") + @slow def test_inference_no_head(self): model = BartModel.from_pretrained("facebook/bart-large").to(torch_device) @@ -559,6 +564,76 @@ def test_cnn_summarization_same_as_fairseq(self): # TODO(SS): run fairseq again with num_beams=2, min_len=20. # TODO(SS): add test case that hits max_length + def test_prepare_seq2seq_batch(self): + tokenizer = self.default_tokenizer + src_text = ["A long paragraph for summrization.", "Another paragraph for summrization."] + tgt_text = [ + "Summary of the text.", + "Another summary.", + ] + expected_src_tokens = [0, 250, 251, 17818, 13, 32933, 21645, 1258, 4, 2] + batch = tokenizer.prepare_seq2seq_batch( + src_text, tgt_texts=tgt_text, max_length=len(expected_src_tokens), return_tensors="pt" + ) + self.assertIsInstance(batch, BatchEncoding) + + self.assertEqual((2, 10), batch.input_ids.shape) + self.assertEqual((2, 10), batch.attention_mask.shape) + result = batch.input_ids.tolist()[0] + self.assertListEqual(expected_src_tokens, result) + # Test that special tokens are reset + + def test_empty_target_text(self): + tokenizer = self.default_tokenizer + src_text = ["A long paragraph for summrization.", "Another paragraph for summrization."] + batch = tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt") + # check if input_ids are returned and no decoder_input_ids + self.assertIn("input_ids", batch) + self.assertIn("attention_mask", batch) + self.assertNotIn("decoder_input_ids", batch) + self.assertNotIn("decoder_attention_mask", batch) + + def test_max_target_length(self): + tokenizer = self.default_tokenizer + src_text = ["A long paragraph for summrization.", "Another paragraph for summrization."] + tgt_text = [ + "Summary of the text.", + "Another summary.", + ] + batch = tokenizer.prepare_seq2seq_batch( + src_text, tgt_texts=tgt_text, max_target_length=32, padding="max_length", return_tensors="pt" + ) + self.assertEqual(32, batch["decoder_input_ids"].shape[1]) + self.assertEqual(32, batch["decoder_attention_mask"].shape[1]) + + # test None max_target_length + 
batch = tokenizer.prepare_seq2seq_batch( + src_text, tgt_texts=tgt_text, max_length=32, padding="max_length", return_tensors="pt" + ) + self.assertEqual(32, batch["decoder_input_ids"].shape[1]) + self.assertEqual(32, batch["decoder_attention_mask"].shape[1]) + + def test_outputs_not_longer_than_maxlen(self): + tokenizer = self.default_tokenizer + + batch = tokenizer.prepare_seq2seq_batch(["I am a small frog" * 1024, "I am a small frog"], return_tensors="pt") + self.assertIsInstance(batch, BatchEncoding) + self.assertEqual(batch.input_ids.shape, (2, 1024)) + + def test_special_tokens(self): + tokenizer = self.default_tokenizer + src_text = ["A long paragraph for summrization."] + tgt_text = [ + "Summary of the text.", + ] + batch = tokenizer.prepare_seq2seq_batch(src_text, tgt_texts=tgt_text, return_tensors="pt") + input_ids = batch["input_ids"] + decoder_input_ids = batch["decoder_input_ids"] + self.assertTrue((input_ids[:, 0] == tokenizer.bos_token_id).all().item()) + self.assertTrue((decoder_input_ids[:, 0] == tokenizer.bos_token_id).all().item()) + self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item()) + self.assertTrue((decoder_input_ids[:, -1] == tokenizer.eos_token_id).all().item()) + @require_torch class TestSinusoidalPositionalEmbeddings(unittest.TestCase): From baf4018c842032ce969b49c5f35a213d564f91f0 Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Mon, 3 Aug 2020 21:31:38 +0530 Subject: [PATCH 02/10] doc nit --- src/transformers/tokenization_bart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py index aac75ca526386a..270dde255462d6 100644 --- a/src/transformers/tokenization_bart.py +++ b/src/transformers/tokenization_bart.py @@ -55,7 +55,7 @@ def prepare_seq2seq_batch( return_tensors: str = "None", **kwargs, ) -> BatchEncoding: - """Prepare a batch that can be passed directly to an instance of BartModel. + """Prepare a batch that can be passed directly to an instance of :class:`~transformers.BartModel`. Args: src_texts (:obj:`List[str]`): list of src texts From 0301dfd0229a9caea4b9c1ac26803d40011e6d88 Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Mon, 3 Aug 2020 21:42:40 +0530 Subject: [PATCH 03/10] better docs --- src/transformers/tokenization_bart.py | 30 +++++++++++++++++++-------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py index 270dde255462d6..df8335993ad1fc 100644 --- a/src/transformers/tokenization_bart.py +++ b/src/transformers/tokenization_bart.py @@ -58,19 +58,31 @@ def prepare_seq2seq_batch( """Prepare a batch that can be passed directly to an instance of :class:`~transformers.BartModel`. Args: src_texts (:obj:`List[str]`): - list of src texts + List of input texts. tgt_texts (:obj:`List[str]`, `optional`): - list of tgt texts + List of target texts. max_length (:obj:`int`, `optional`): - maximum length for the source text which defers to the config value of 1024 for facebook/bart* + Maximum length for the source texts. If not provided, this will use the predefined model maximum length. max_target_length (:obj:`int`, `optional`): - maximum length for the target text which defers to the config value of 1024 for facebook/bart* - padding (:obj:`str`, `optional`, defaults to "longest"): - strategy for padding `input_ids` and `decoder_input_ids`. Should be "max_length" or "longest". 
- return_tensors (:obj:`str`, `optional`): - Can be set to ‘tf’, ‘pt’ or ‘np’ to return respectively TensorFlow `tf.constant`, PyTorch `torch.Tensor` or Numpy :oj: np.ndarray instead of a list of python integers. + Maximum length for the target texts. If not provided, this will use the predefined model maximum length. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a + single sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): + If set, will return tensors instead of list of python integers. Acceptable values are: + + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. **kwargs: - passed to self.__call__ + Additional keyword arguments passed along to :obj:`self.__call__`. Returns: :class:`~transformers.BatchEncoding`: with keys input_ids, attention_mask, decoder_input_ids, decoder_attention_mask. """ From 24736ddd72ce562026fdc004be8d69ba2235e69a Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Mon, 3 Aug 2020 21:55:10 +0530 Subject: [PATCH 04/10] fix doc indent --- src/transformers/tokenization_bart.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py index df8335993ad1fc..b01b6f6fb253ca 100644 --- a/src/transformers/tokenization_bart.py +++ b/src/transformers/tokenization_bart.py @@ -66,8 +66,8 @@ def prepare_seq2seq_batch( max_target_length (:obj:`int`, `optional`): Maximum length for the target texts. If not provided, this will use the predefined model maximum length. padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence if provided). From ad1b9188792aa5210f9bb4588760dcefa4f2d41b Mon Sep 17 00:00:00 2001 From: sgugger Date: Mon, 3 Aug 2020 13:18:16 -0400 Subject: [PATCH 05/10] Fix indentation --- src/transformers/tokenization_bart.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py index b01b6f6fb253ca..238645b238ddec 100644 --- a/src/transformers/tokenization_bart.py +++ b/src/transformers/tokenization_bart.py @@ -55,7 +55,9 @@ def prepare_seq2seq_batch( return_tensors: str = "None", **kwargs, ) -> BatchEncoding: - """Prepare a batch that can be passed directly to an instance of :class:`~transformers.BartModel`. 
+ r""" + Prepare a batch that can be passed directly to an instance of :class:`~transformers.BartModel`. + Args: src_texts (:obj:`List[str]`): List of input texts. From f0c3039eda14af0fa5715bcc06e043dca53ce049 Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Thu, 13 Aug 2020 19:36:46 +0530 Subject: [PATCH 06/10] better doc --- src/transformers/tokenization_bart.py | 52 +++++++++++++++++++-------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py index b01b6f6fb253ca..1fb03ce22ce923 100644 --- a/src/transformers/tokenization_bart.py +++ b/src/transformers/tokenization_bart.py @@ -53,38 +53,62 @@ def prepare_seq2seq_batch( max_target_length: Optional[int] = None, padding: str = "longest", return_tensors: str = "None", + truncation=True, **kwargs, ) -> BatchEncoding: """Prepare a batch that can be passed directly to an instance of :class:`~transformers.BartModel`. Args: - src_texts (:obj:`List[str]`): - List of input texts. - tgt_texts (:obj:`List[str]`, `optional`): - List of target texts. + rc_texts: (:obj:`list`): + list of documents to summarize or source language texts + tgt_texts: (:obj:`list`, `optional`): + list of tgt language texts or summaries. max_length (:obj:`int`, `optional`): - Maximum length for the source texts. If not provided, this will use the predefined model maximum length. + Controls the maximum length for encoder inputs (documents to summarize or source language texts) + If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum + length is required by one of the truncation/padding parameters. If the model has no specific maximum + input length (like XLNet) truncation/padding to a maximum length will be deactivated. max_target_length (:obj:`int`, `optional`): - Maximum length for the target texts. If not provided, this will use the predefined model maximum length. + Controls the maximum length of decoder inputs (target language texts or summaries) + If left unset or set to :obj:`None`, this will use the max_length value. padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - + Activates and controls padding. Accepts the following values: * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence if provided). * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not provided. * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths). - return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): + return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"): If set, will return tensors instead of list of python integers. Acceptable values are: - * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. + truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`): + Activates and controls truncation. 
Accepts the following values: + * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument + :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not + provided. This will truncate token by token, removing a token from the longest sequence in the pair + if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to + the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with + sequence lengths greater than the model maximum admissible input size). **kwargs: Additional keyword arguments passed along to :obj:`self.__call__`. Returns: - :class:`~transformers.BatchEncoding`: with keys input_ids, attention_mask, decoder_input_ids, decoder_attention_mask. + :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields: + - **input_ids** -- List of token ids to be fed to the encoder. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model. + - **decoder_input_ids** -- List of token ids to be fed to the decoder. + - **decoder_attention_mask** -- List of indices specifying which tokens should be attended to by the decoder. + This does not include causal mask, which is built by the model. + + The full set of keys ``[input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]``, + will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys. """ if max_length is None: max_length = self.model_max_length @@ -94,7 +118,7 @@ def prepare_seq2seq_batch( return_tensors=return_tensors, max_length=max_length, padding=padding, - truncation=True, + truncation=truncation, **kwargs, ) if tgt_texts is None: @@ -108,7 +132,7 @@ def prepare_seq2seq_batch( return_tensors=return_tensors, padding=padding, max_length=max_target_length, - truncation=True, + truncation=truncation, **kwargs, ) for k, v in decoder_inputs.items(): From 7084cf0288817a415f0029891ef1f6105c33984d Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Thu, 13 Aug 2020 19:38:00 +0530 Subject: [PATCH 07/10] Merge branch 'bart-tok-s2s-batch' of https://github.com/patil-suraj/transformers into bart-tok-s2s-batch --- src/transformers/tokenization_bart.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py index 1fb03ce22ce923..73aa9e1232b7ed 100644 --- a/src/transformers/tokenization_bart.py +++ b/src/transformers/tokenization_bart.py @@ -56,7 +56,9 @@ def prepare_seq2seq_batch( truncation=True, **kwargs, ) -> BatchEncoding: - """Prepare a batch that can be passed directly to an instance of :class:`~transformers.BartModel`. + r""" + Prepare a batch that can be passed directly to an instance of :class:`~transformers.BartModel`. 
+ Args: rc_texts: (:obj:`list`): list of documents to summarize or source language texts From fdf93d908d63c38d40d25331436cd3b4d51c9936 Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Thu, 13 Aug 2020 19:39:45 +0530 Subject: [PATCH 08/10] typo --- src/transformers/tokenization_bart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py index 73aa9e1232b7ed..1e22f4b9262818 100644 --- a/src/transformers/tokenization_bart.py +++ b/src/transformers/tokenization_bart.py @@ -60,7 +60,7 @@ def prepare_seq2seq_batch( Prepare a batch that can be passed directly to an instance of :class:`~transformers.BartModel`. Args: - rc_texts: (:obj:`list`): + src_texts: (:obj:`list`): list of documents to summarize or source language texts tgt_texts: (:obj:`list`, `optional`): list of tgt language texts or summaries. From ebd603e6fd8dc770b8001d5d7a0a0126b0affd16 Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Thu, 13 Aug 2020 20:26:55 +0530 Subject: [PATCH 09/10] fix docs --- src/transformers/tokenization_bart.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py index 1e22f4b9262818..e1d4ea513fe73a 100644 --- a/src/transformers/tokenization_bart.py +++ b/src/transformers/tokenization_bart.py @@ -57,20 +57,21 @@ def prepare_seq2seq_batch( **kwargs, ) -> BatchEncoding: r""" + Prepare a batch that can be passed directly to an instance of :class:`~transformers.BartModel`. Args: - src_texts: (:obj:`list`): - list of documents to summarize or source language texts - tgt_texts: (:obj:`list`, `optional`): - list of tgt language texts or summaries. + src_texts: (:obj:`List[str]`): + List of documents to summarize or source language texts. + tgt_texts: (:obj:`List[str]`, `optional`): + List of summaries or target language texts. max_length (:obj:`int`, `optional`): - Controls the maximum length for encoder inputs (documents to summarize or source language texts) + Controls the maximum length for encoder inputs (documents to summarize or source language texts). If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length is required by one of the truncation/padding parameters. If the model has no specific maximum input length (like XLNet) truncation/padding to a maximum length will be deactivated. max_target_length (:obj:`int`, `optional`): - Controls the maximum length of decoder inputs (target language texts or summaries) + Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or set to :obj:`None`, this will use the max_length value. padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): Activates and controls padding. Accepts the following values: @@ -101,6 +102,7 @@ def prepare_seq2seq_batch( sequence lengths greater than the model maximum admissible input size). **kwargs: Additional keyword arguments passed along to :obj:`self.__call__`. + Returns: :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields: - **input_ids** -- List of token ids to be fed to the encoder. 
From b05ee8f033d46a8c21bea84be4aea635dfded3ce Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Fri, 14 Aug 2020 20:24:04 +0530 Subject: [PATCH 10/10] fix docs --- src/transformers/tokenization_bart.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py index e1d4ea513fe73a..bf456be302135f 100644 --- a/src/transformers/tokenization_bart.py +++ b/src/transformers/tokenization_bart.py @@ -75,6 +75,7 @@ def prepare_seq2seq_batch( If left unset or set to :obj:`None`, this will use the max_length value. padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): Activates and controls padding. Accepts the following values: + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence if provided). * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the @@ -83,11 +84,13 @@ def prepare_seq2seq_batch( different lengths). return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"): If set, will return tensors instead of list of python integers. Acceptable values are: + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`): Activates and controls truncation. Accepts the following values: + * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will truncate token by token, removing a token from the longest sequence in the pair @@ -105,6 +108,7 @@ def prepare_seq2seq_batch( Returns: :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields: + - **input_ids** -- List of token ids to be fed to the encoder. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model. - **decoder_input_ids** -- List of token ids to be fed to the decoder.
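
Example usage of the ``prepare_seq2seq_batch`` API introduced by this patch series. This is a minimal sketch, not part of the patches themselves: it assumes a transformers build that already contains these commits plus a PyTorch installation (for ``return_tensors="pt"``); the example texts are placeholders, and ``facebook/bart-large`` is the checkpoint already used by the tests in PATCH 01::

    from transformers import BartTokenizer

    # Illustrative only: requires a transformers version that ships the
    # prepare_seq2seq_batch method added by these patches.
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

    src_texts = [
        "A long document that should be summarized.",
        "Another long document that should be summarized.",
    ]
    tgt_texts = [
        "Summary of the first document.",
        "Summary of the second document.",
    ]

    # Source-only call: returns just input_ids and attention_mask,
    # no decoder_* keys (see test_empty_target_text above).
    encoder_only = tokenizer.prepare_seq2seq_batch(src_texts, return_tensors="pt")
    print(sorted(encoder_only.keys()))  # ['attention_mask', 'input_ids']

    # Source + target call: target encodings come back under decoder_* keys,
    # padded/truncated independently via max_target_length.
    batch = tokenizer.prepare_seq2seq_batch(
        src_texts,
        tgt_texts=tgt_texts,
        max_length=1024,          # encoder-side truncation
        max_target_length=64,     # decoder-side truncation
        padding="longest",
        return_tensors="pt",
    )
    print(sorted(batch.keys()))
    # ['attention_mask', 'decoder_attention_mask', 'decoder_input_ids', 'input_ids']

Because the target side is returned under ``decoder_``-prefixed keys, the resulting ``BatchEncoding`` can be unpacked directly into a BART forward pass (``model(**batch)``), which is the "passed directly to an instance of BartModel" contract stated in the method's docstring.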