Default dataset_text_field to "text" #2078

Merged: 14 commits, Oct 4, 2024
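In short, `SFTConfig.dataset_text_field` now defaults to `"text"` instead of `None`, so the argument can be dropped wherever the dataset already stores its samples in a `"text"` column. Below is a minimal sketch of the post-change usage, mirroring the updated `sft_trainer.mdx` example further down (the model and dataset names are the ones already used in the docs):

```python
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer

# IMDB keeps its samples in a "text" column, which matches the new default
# of `dataset_text_field`, so the field no longer needs to be set explicitly.
dataset = load_dataset("stanfordnlp/imdb", split="train")

training_args = SFTConfig(
    max_seq_length=512,
    output_dir="/tmp",
)

trainer = SFTTrainer(
    "facebook/opt-350m",
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
```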
1 change: 0 additions & 1 deletion commands/run_sft.sh
@@ -41,7 +41,6 @@ accelerate launch $EXTRA_ACCELERATE_ARGS \
--dataset_name $DATASET_NAME \
--output_dir $OUTPUT_DIR \
--max_steps $MAX_STEPS \
- --dataset_text_field 'text' \
--per_device_train_batch_size $BATCH_SIZE \
--max_seq_length $SEQ_LEN \
$EXTRA_TRAINING_ARGS
2 changes: 0 additions & 2 deletions docs/source/clis.mdx
@@ -26,8 +26,6 @@ model_name_or_path:
trl-internal-testing/tiny-random-LlamaForCausalLM
dataset_name:
stanfordnlp/imdb
- dataset_text_field:
- text
report_to:
none
learning_rate:
15 changes: 3 additions & 12 deletions docs/source/sft_trainer.mdx
@@ -17,7 +17,6 @@ from trl import SFTConfig, SFTTrainer
dataset = load_dataset("stanfordnlp/imdb", split="train")

training_args = SFTConfig(
- dataset_text_field="text",
max_seq_length=512,
output_dir="/tmp",
)
@@ -110,10 +109,7 @@ collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_temp

trainer = SFTTrainer(
model,
- args=SFTConfig(
- output_dir="/tmp",
- dataset_text_field = "text",
- ),
+ args=SFTConfig(output_dir="/tmp"),
train_dataset=dataset,
data_collator=collator,
)
@@ -271,7 +267,7 @@ To properly format your input make sure to process all the examples by looping o

```python
...
- training_args = SFTConfig(packing=True, dataset_text_field="text",)
+ training_args = SFTConfig(packing=True)

trainer = SFTTrainer(
"facebook/opt-350m",
@@ -578,11 +574,7 @@ model = FastLanguageModel.get_peft_model(
random_state=3407,
)

- training_args = SFTConfig(
- output_dir="./output",
- max_seq_length=max_seq_length,
- dataset_text_field="text",
- )
+ training_args = SFTConfig(output_dir="./output", max_seq_length=max_seq_length)

trainer = SFTTrainer(
model=model,
@@ -742,7 +734,6 @@ print(collated_data.keys()) # dict_keys(['input_ids', 'attention_mask', 'pixel_
Now that we have prepared the data and defined the collator, we can proceed with training the model. To ensure that the data is not processed as text-only, we need to set a couple of arguments in the `SFTConfig`, specifically `dataset_text_field` and `remove_unused_columns`. We also need to set `skip_prepare_dataset` to `True` to avoid the default processing of the dataset. Below is an example of how to set up the `SFTTrainer`.

```python
- training_args.dataset_text_field = "" # needs a dummy field
training_args.remove_unused_columns = False
training_args.dataset_kwargs = {"skip_prepare_dataset": True}

2 changes: 0 additions & 2 deletions examples/cli_configs/example_config.yaml
@@ -10,8 +10,6 @@ model_name_or_path:
trl-internal-testing/tiny-random-LlamaForCausalLM
dataset_name:
stanfordnlp/imdb
- dataset_text_field:
- text
report_to:
none
learning_rate:
2 changes: 0 additions & 2 deletions examples/scripts/sft.py
@@ -15,7 +15,6 @@
# regular:
python examples/scripts/sft.py \
--model_name_or_path="facebook/opt-350m" \
- --dataset_text_field="text" \
--report_to="wandb" \
--learning_rate=1.41e-5 \
--per_device_train_batch_size=64 \
@@ -30,7 +29,6 @@
# peft:
python examples/scripts/sft.py \
--model_name_or_path="facebook/opt-350m" \
- --dataset_text_field="text" \
--report_to="wandb" \
--learning_rate=1.41e-5 \
--per_device_train_batch_size=64 \
1 change: 0 additions & 1 deletion examples/scripts/sft_vlm.py
@@ -54,7 +54,6 @@
parser = TrlParser((SFTScriptArguments, SFTConfig, ModelConfig))
script_args, training_args, model_config = parser.parse_args_and_config()
training_args.gradient_checkpointing_kwargs = dict(use_reentrant=False)
- training_args.dataset_text_field = "" # need a dummy field
training_args.remove_unused_columns = False
training_args.dataset_kwargs = {"skip_prepare_dataset": True}

10 changes: 0 additions & 10 deletions tests/slow/test_sft_slow.py
@@ -45,7 +45,6 @@ class SFTTrainerSlowTester(unittest.TestCase):
def setUp(self):
self.train_dataset = load_dataset("stanfordnlp/imdb", split="train[:10%]")
self.eval_dataset = load_dataset("stanfordnlp/imdb", split="test[:10%]")
- self.dataset_text_field = "text"
self.max_seq_length = 128
self.peft_config = LoraConfig(
lora_alpha=16,
@@ -74,7 +73,6 @@ def test_sft_trainer_str(self, model_name, packing):
per_device_train_batch_size=2,
max_steps=10,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
)

@@ -101,7 +99,6 @@ def test_sft_trainer_transformers(self, model_name, packing):
per_device_train_batch_size=2,
max_steps=10,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
)

@@ -136,7 +133,6 @@ def test_sft_trainer_peft(self, model_name, packing):
max_steps=10,
fp16=True,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
)

@@ -173,7 +169,6 @@ def test_sft_trainer_transformers_mp(self, model_name, packing):
max_steps=10,
fp16=True, # this is sufficient to enable amp
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
)

@@ -206,7 +201,6 @@ def test_sft_trainer_transformers_mp_gc(self, model_name, packing, gradient_chec
per_device_train_batch_size=2,
max_steps=10,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
fp16=True, # this is sufficient to enable amp
gradient_checkpointing=True,
@@ -243,7 +237,6 @@ def test_sft_trainer_transformers_mp_gc_peft(self, model_name, packing, gradient
per_device_train_batch_size=2,
max_steps=10,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
fp16=True, # this is sufficient to enable amp
gradient_checkpointing=True,
@@ -287,7 +280,6 @@ def test_sft_trainer_transformers_mp_gc_device_map(
per_device_train_batch_size=2,
max_steps=10,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
fp16=True, # this is sufficient to enable amp
gradient_checkpointing=True,
@@ -325,7 +317,6 @@ def test_sft_trainer_transformers_mp_gc_peft_qlora(self, model_name, packing, gr
per_device_train_batch_size=2,
max_steps=10,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
fp16=True, # this is sufficient to enable amp
gradient_checkpointing=True,
@@ -410,7 +401,6 @@ def test_sft_trainer_with_liger(self, model_name, packing):
per_device_train_batch_size=2,
max_steps=2,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
use_liger=True,
)
2 changes: 1 addition & 1 deletion tests/test_cli.py
@@ -20,7 +20,7 @@
def test_sft_cli():
try:
subprocess.run(
"trl sft --max_steps 1 --output_dir tmp-sft --model_name_or_path trl-internal-testing/tiny-random-LlamaForCausalLM --dataset_name stanfordnlp/imdb --learning_rate 1e-4 --lr_scheduler_type cosine --dataset_text_field text",
"trl sft --max_steps 1 --output_dir tmp-sft --model_name_or_path trl-internal-testing/tiny-random-LlamaForCausalLM --dataset_name stanfordnlp/imdb --learning_rate 1e-4 --lr_scheduler_type cosine",
shell=True,
check=True,
)
27 changes: 0 additions & 27 deletions tests/test_sft_trainer.py
@@ -146,7 +146,6 @@ def setUp(self):
self.train_dataset = ConstantLengthDataset(
self.tokenizer,
self.dummy_dataset,
- dataset_text_field=None,
formatting_func=formatting_prompts_func,
seq_length=16,
num_of_sequences=16,
@@ -155,7 +154,6 @@
self.eval_dataset = ConstantLengthDataset(
self.tokenizer,
self.dummy_dataset,
- dataset_text_field=None,
formatting_func=formatting_prompts_func,
seq_length=16,
num_of_sequences=16,
@@ -165,7 +163,6 @@ def test_constant_length_dataset(self):
formatted_dataset = ConstantLengthDataset(
self.tokenizer,
self.dummy_dataset,
- dataset_text_field=None,
formatting_func=formatting_prompts_func,
)

@@ -242,24 +239,6 @@ def test_sft_trainer(self):

def test_sft_trainer_uncorrect_data(self):
with tempfile.TemporaryDirectory() as tmp_dir:
- # Shouldn't work as `dataset_text_field` is missing from the arguments
- training_args = SFTConfig(
- output_dir=tmp_dir,
- dataloader_drop_last=True,
- max_steps=2,
- eval_steps=1,
- save_steps=1,
- per_device_train_batch_size=2,
- packing=True,
- report_to="none",
- )
- with pytest.raises(ValueError):
- _ = SFTTrainer(
- model=self.model,
- args=training_args,
- train_dataset=self.dummy_dataset,
- )
-
# Shoud work as SFTTrainer natively supports conversational lm dataset
training_args = SFTConfig(
output_dir=tmp_dir,
@@ -443,7 +422,6 @@ def test_sft_trainer_with_model_num_train_epochs(self):
save_steps=1,
num_train_epochs=2,
per_device_train_batch_size=2,
- dataset_text_field="text",
max_seq_length=16,
num_of_sequences=16,
packing=True,
@@ -469,7 +447,6 @@
save_steps=1,
num_train_epochs=2,
per_device_train_batch_size=2,
- dataset_text_field="text",
max_seq_length=16,
report_to="none",
)
@@ -519,7 +496,6 @@ def test_sft_trainer_with_model(self):
max_steps=2,
save_steps=1,
per_device_train_batch_size=2,
- dataset_text_field="text",
max_seq_length=16,
num_of_sequences=16,
packing=True,
@@ -594,7 +570,6 @@ def test_sft_trainer_with_model(self):
max_steps=2,
save_steps=1,
per_device_train_batch_size=2,
- dataset_text_field="text",
max_seq_length=16,
report_to="none",
)
@@ -1159,7 +1134,6 @@ def test_sft_trainer_skip_prepare_dataset(self):
per_device_train_batch_size=2,
gradient_checkpointing=True,
remove_unused_columns=False,
- dataset_text_field="text", # need a dummy field
dataset_kwargs={"skip_prepare_dataset": True},
report_to="none",
)
@@ -1209,7 +1183,6 @@ def test_sft_trainer_llava(self):
per_device_train_batch_size=2,
per_device_eval_batch_size=2,
remove_unused_columns=False,
- dataset_text_field="", # need a dummy field
dataset_kwargs={"skip_prepare_dataset": True},
report_to="none",
)
4 changes: 2 additions & 2 deletions trl/trainer/sft_config.py
@@ -27,7 +27,7 @@ class SFTConfig(TrainingArguments):
command line.

Parameters:
- dataset_text_field (`Optional[str]`, *optional*, defaults to `None`):
+ dataset_text_field (`str`, *optional*, defaults to `"text"`):
Name of the text field of the dataset. If provided, the trainer will automatically create a
[`ConstantLengthDataset`] based on `dataset_text_field`.
packing (`bool`, *optional*, defaults to `False`):
@@ -56,7 +56,7 @@
Monkey patch the model with Liger kernels to increase throughput and reduce memory usage.
"""

- dataset_text_field: Optional[str] = None
+ dataset_text_field: str = "text"
packing: bool = False
max_seq_length: Optional[int] = None
dataset_num_proc: Optional[int] = None
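With the new default in place, `dataset_text_field` only needs to be set when a dataset keeps its text under a different column name. A small sketch of both cases (the `"content"` column name is hypothetical, used purely for illustration):

```python
from trl import SFTConfig

# Datasets with a "text" column need no extra configuration.
default_args = SFTConfig(output_dir="/tmp")
print(default_args.dataset_text_field)  # -> "text"

# Hypothetical dataset whose text lives in a "content" column: override the default.
custom_args = SFTConfig(output_dir="/tmp", dataset_text_field="content")
print(custom_args.dataset_text_field)  # -> "content"
```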