[SFTTrainer] Fix non packed dataset #444
@@ -276,27 +276,43 @@ def _prepare_non_packed_dataloader(

The method after this change:

    def _prepare_non_packed_dataloader(
        self, tokenizer, dataset, dataset_text_field, max_seq_len, formatting_func=None
    ):
        use_formatting_func = formatting_func is not None and dataset_text_field is None
        self._dataset_sanity_checked = False

        # Inspired from: https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt
        def tokenize(element):
            input_batch = []
            attention_masks = []

            outputs = tokenizer(
                element[dataset_text_field] if not use_formatting_func else formatting_func(element),
                truncation=True,
                padding=True,
                max_length=max_seq_len,
                return_overflowing_tokens=False,
                return_length=True,
            )

            if use_formatting_func and not self._dataset_sanity_checked:
                if not isinstance(formatting_func(element), list):
                    raise ValueError(
                        "The `formatting_func` should return a list of processed strings since it can lead to silent bugs."
                    )
                else:
                    self._dataset_sanity_checked = True

            for length, input_ids, attention_mask in zip(
                outputs["length"], outputs["input_ids"], outputs["attention_mask"]
            ):
                if length == max_seq_len:
                    input_batch.append(input_ids)
                    attention_masks.append(attention_mask)

            if len(input_batch) == 0:
                # warn users
                warnings.warn(
                    f"Found 0 samples with a length of {max_seq_len}. You might want to decrease the `max_seq_len` argument."
                )

            return {"input_ids": input_batch, "attention_mask": attention_masks}

        tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)

(Previously the loop iterated only over outputs["length"] and outputs["input_ids"], and the method returned {"input_ids": input_batch} without attention masks.)

Review comment on the tokenization logic above:

@younesbelkada this code is still incorrect. Consider the case where all samples in the dataset are shorter than `max_seq_len`: with `padding=True` the batch is only padded to its longest sample, so the `length == max_seq_len` filter keeps nothing. Perhaps: (suggested change not shown in this view)

Reply from younesbelkada:

Yes, you are correct, thanks a lot for flagging. Do you want to open a PR for that? Happy to do it otherwise.
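A minimal, self-contained sketch of the failure mode the reviewer describes. The `toy_tokenize` helper and its character-level "tokens" are stand-ins invented for illustration; they mimic the real tokenizer call with `truncation=True`, `padding=True`, `max_length=max_seq_len`, `return_length=True`, but are not part of TRL or transformers:

```python
# Hypothetical stand-in for the Hugging Face tokenizer call used in tokenize().
def toy_tokenize(texts, max_seq_len):
    ids = [[ord(c) for c in t][:max_seq_len] for t in texts]   # truncation
    longest = max(len(x) for x in ids)                         # padding to the longest sample in the batch
    input_ids = [x + [0] * (longest - len(x)) for x in ids]
    attention_mask = [[1] * len(x) + [0] * (longest - len(x)) for x in ids]
    length = [len(x) for x in input_ids]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "length": length}

max_seq_len = 512
outputs = toy_tokenize(["short sample", "another short sample"], max_seq_len)

# The PR's filter keeps only samples whose padded length equals max_seq_len.
# If every sample is shorter, the batch comes out empty and the
# "Found 0 samples" warning fires.
kept = [ids for l, ids in zip(outputs["length"], outputs["input_ids"]) if l == max_seq_len]
print(len(kept))  # 0

# One possible fix (an assumption, not necessarily the follow-up that was merged):
# keep samples at or below max_seq_len instead of requiring equality.
kept_fixed = [ids for l, ids in zip(outputs["length"], outputs["input_ids"]) if l <= max_seq_len]
print(len(kept_fixed))  # 2
```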
A follow-up review comment:

Oh interesting. So previously, we were dumping an entire dataset into the prompt?

Reply from younesbelkada:

Exactly, yes :D The previous examples in the documentation were wrong, and we were dumping entire mini-batches into the prompt when processing the dataset.. :/
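For context, the new sanity check in the diff expects `formatting_func` to return a list of formatted strings, one per example in the batched `map` call, rather than a single concatenated string. Below is a hedged sketch of a compliant function; the field names `instruction` and `output` are placeholders, not a real dataset schema:

```python
# Hedged sketch: a formatting_func that passes the new isinstance(..., list) check.
# It receives a batch of examples (a dict of lists) and returns one formatted
# string per example, never a single blob built from the whole batch.
def formatting_func(examples):
    texts = []
    for instruction, output in zip(examples["instruction"], examples["output"]):
        texts.append(f"### Question: {instruction}\n### Answer: {output}")
    return texts
```

Returning a single string here is what caused whole mini-batches to be dumped into one prompt, which is exactly the silent bug the new ValueError guards against.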