Default dataset_text_field to "text" #2078

Merged: 14 commits, Oct 4, 2024
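In short, `SFTConfig.dataset_text_field` now defaults to `"text"` instead of `None`, so the argument can be dropped wherever the dataset already stores its samples in a `"text"` column. Below is a minimal sketch of the post-change usage, mirroring the updated `sft_trainer.mdx` example further down (the model and dataset names are the ones already used in the docs):

```python
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer

# IMDB keeps its samples in a "text" column, which matches the new default
# of `dataset_text_field`, so the field no longer needs to be set explicitly.
dataset = load_dataset("stanfordnlp/imdb", split="train")

training_args = SFTConfig(
    max_seq_length=512,
    output_dir="/tmp",
)

trainer = SFTTrainer(
    "facebook/opt-350m",
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
```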
1 change: 0 additions & 1 deletion commands/run_sft.sh
@@ -41,7 +41,6 @@ accelerate launch $EXTRA_ACCELERATE_ARGS \
--dataset_name $DATASET_NAME \
--output_dir $OUTPUT_DIR \
--max_steps $MAX_STEPS \
- --dataset_text_field 'text' \
--per_device_train_batch_size $BATCH_SIZE \
--max_seq_length $SEQ_LEN \
$EXTRA_TRAINING_ARGS
2 changes: 0 additions & 2 deletions docs/source/clis.mdx
@@ -26,8 +26,6 @@ model_name_or_path:
trl-internal-testing/tiny-random-LlamaForCausalLM
dataset_name:
stanfordnlp/imdb
- dataset_text_field:
- text
report_to:
none
learning_rate:
15 changes: 3 additions & 12 deletions docs/source/sft_trainer.mdx
@@ -17,7 +17,6 @@ from trl import SFTConfig, SFTTrainer
dataset = load_dataset("stanfordnlp/imdb", split="train")

training_args = SFTConfig(
- dataset_text_field="text",
max_seq_length=512,
output_dir="/tmp",
)
@@ -110,10 +109,7 @@ collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_temp

trainer = SFTTrainer(
model,
- args=SFTConfig(
- output_dir="/tmp",
- dataset_text_field = "text",
- ),
+ args=SFTConfig(output_dir="/tmp"),
train_dataset=dataset,
data_collator=collator,
)
@@ -271,7 +267,7 @@ To properly format your input make sure to process all the examples by looping o

```python
...
- training_args = SFTConfig(packing=True, dataset_text_field="text",)
+ training_args = SFTConfig(packing=True)

trainer = SFTTrainer(
"facebook/opt-350m",
@@ -578,11 +574,7 @@ model = FastLanguageModel.get_peft_model(
random_state=3407,
)

- training_args = SFTConfig(
- output_dir="./output",
- max_seq_length=max_seq_length,
- dataset_text_field="text",
- )
+ training_args = SFTConfig(output_dir="./output", max_seq_length=max_seq_length)

trainer = SFTTrainer(
model=model,
@@ -742,7 +734,6 @@ print(collated_data.keys()) # dict_keys(['input_ids', 'attention_mask', 'pixel_
Now that we have prepared the data and defined the collator, we can proceed with training the model. To ensure that the data is not processed as text-only, we need to set a couple of arguments in the `SFTConfig`, specifically `dataset_text_field` and `remove_unused_columns`. We also need to set `skip_prepare_dataset` to `True` to avoid the default processing of the dataset. Below is an example of how to set up the `SFTTrainer`.

```python
- training_args.dataset_text_field = "" # needs a dummy field
training_args.remove_unused_columns = False
training_args.dataset_kwargs = {"skip_prepare_dataset": True}

2 changes: 0 additions & 2 deletions examples/cli_configs/example_config.yaml
@@ -10,8 +10,6 @@ model_name_or_path:
trl-internal-testing/tiny-random-LlamaForCausalLM
dataset_name:
stanfordnlp/imdb
- dataset_text_field:
- text
report_to:
none
learning_rate:
2 changes: 0 additions & 2 deletions examples/scripts/sft.py
@@ -15,7 +15,6 @@
# regular:
python examples/scripts/sft.py \
--model_name_or_path="facebook/opt-350m" \
- --dataset_text_field="text" \
--report_to="wandb" \
--learning_rate=1.41e-5 \
--per_device_train_batch_size=64 \
@@ -30,7 +29,6 @@
# peft:
python examples/scripts/sft.py \
--model_name_or_path="facebook/opt-350m" \
- --dataset_text_field="text" \
--report_to="wandb" \
--learning_rate=1.41e-5 \
--per_device_train_batch_size=64 \
1 change: 0 additions & 1 deletion examples/scripts/sft_vlm.py
@@ -54,7 +54,6 @@
parser = TrlParser((SFTScriptArguments, SFTConfig, ModelConfig))
script_args, training_args, model_config = parser.parse_args_and_config()
training_args.gradient_checkpointing_kwargs = dict(use_reentrant=False)
- training_args.dataset_text_field = "" # need a dummy field
training_args.remove_unused_columns = False
training_args.dataset_kwargs = {"skip_prepare_dataset": True}

10 changes: 0 additions & 10 deletions tests/slow/test_sft_slow.py
@@ -45,7 +45,6 @@ class SFTTrainerSlowTester(unittest.TestCase):
def setUp(self):
self.train_dataset = load_dataset("stanfordnlp/imdb", split="train[:10%]")
self.eval_dataset = load_dataset("stanfordnlp/imdb", split="test[:10%]")
- self.dataset_text_field = "text"
self.max_seq_length = 128
self.peft_config = LoraConfig(
lora_alpha=16,
@@ -74,7 +73,6 @@ def test_sft_trainer_str(self, model_name, packing):
per_device_train_batch_size=2,
max_steps=10,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
)

@@ -101,7 +99,6 @@ def test_sft_trainer_transformers(self, model_name, packing):
per_device_train_batch_size=2,
max_steps=10,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
)

@@ -136,7 +133,6 @@ def test_sft_trainer_peft(self, model_name, packing):
max_steps=10,
fp16=True,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
)

@@ -173,7 +169,6 @@ def test_sft_trainer_transformers_mp(self, model_name, packing):
max_steps=10,
fp16=True, # this is sufficient to enable amp
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
)

@@ -206,7 +201,6 @@ def test_sft_trainer_transformers_mp_gc(self, model_name, packing, gradient_chec
per_device_train_batch_size=2,
max_steps=10,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
fp16=True, # this is sufficient to enable amp
gradient_checkpointing=True,
@@ -243,7 +237,6 @@ def test_sft_trainer_transformers_mp_gc_peft(self, model_name, packing, gradient
per_device_train_batch_size=2,
max_steps=10,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
fp16=True, # this is sufficient to enable amp
gradient_checkpointing=True,
@@ -287,7 +280,6 @@ def test_sft_trainer_transformers_mp_gc_device_map(
per_device_train_batch_size=2,
max_steps=10,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
fp16=True, # this is sufficient to enable amp
gradient_checkpointing=True,
@@ -325,7 +317,6 @@ def test_sft_trainer_transformers_mp_gc_peft_qlora(self, model_name, packing, gr
per_device_train_batch_size=2,
max_steps=10,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
fp16=True, # this is sufficient to enable amp
gradient_checkpointing=True,
@@ -410,7 +401,6 @@ def test_sft_trainer_with_liger(self, model_name, packing):
per_device_train_batch_size=2,
max_steps=2,
packing=packing,
- dataset_text_field=self.dataset_text_field,
max_seq_length=self.max_seq_length,
use_liger=True,
)
2 changes: 1 addition & 1 deletion tests/test_cli.py
@@ -20,7 +20,7 @@
def test_sft_cli():
try:
subprocess.run(
"trl sft --max_steps 1 --output_dir tmp-sft --model_name_or_path trl-internal-testing/tiny-random-LlamaForCausalLM --dataset_name stanfordnlp/imdb --learning_rate 1e-4 --lr_scheduler_type cosine --dataset_text_field text",
"trl sft --max_steps 1 --output_dir tmp-sft --model_name_or_path trl-internal-testing/tiny-random-LlamaForCausalLM --dataset_name stanfordnlp/imdb --learning_rate 1e-4 --lr_scheduler_type cosine",
shell=True,
check=True,
)
27 changes: 0 additions & 27 deletions tests/test_sft_trainer.py
@@ -146,7 +146,6 @@ def setUp(self):
self.train_dataset = ConstantLengthDataset(
self.tokenizer,
self.dummy_dataset,
- dataset_text_field=None,
formatting_func=formatting_prompts_func,
seq_length=16,
num_of_sequences=16,
@@ -155,7 +154,6 @@
self.eval_dataset = ConstantLengthDataset(
self.tokenizer,
self.dummy_dataset,
- dataset_text_field=None,
formatting_func=formatting_prompts_func,
seq_length=16,
num_of_sequences=16,
@@ -165,7 +163,6 @@ def test_constant_length_dataset(self):
formatted_dataset = ConstantLengthDataset(
self.tokenizer,
self.dummy_dataset,
- dataset_text_field=None,
formatting_func=formatting_prompts_func,
)

@@ -242,24 +239,6 @@ def test_sft_trainer(self):

def test_sft_trainer_uncorrect_data(self):
with tempfile.TemporaryDirectory() as tmp_dir:
- # Shouldn't work as `dataset_text_field` is missing from the arguments
- training_args = SFTConfig(
- output_dir=tmp_dir,
- dataloader_drop_last=True,
- max_steps=2,
- eval_steps=1,
- save_steps=1,
- per_device_train_batch_size=2,
- packing=True,
- report_to="none",
- )
- with pytest.raises(ValueError):
- _ = SFTTrainer(
- model=self.model,
- args=training_args,
- train_dataset=self.dummy_dataset,
- )
-
# Shoud work as SFTTrainer natively supports conversational lm dataset
training_args = SFTConfig(
output_dir=tmp_dir,
@@ -443,7 +422,6 @@ def test_sft_trainer_with_model_num_train_epochs(self):
save_steps=1,
num_train_epochs=2,
per_device_train_batch_size=2,
- dataset_text_field="text",
max_seq_length=16,
num_of_sequences=16,
packing=True,
@@ -469,7 +447,6 @@
save_steps=1,
num_train_epochs=2,
per_device_train_batch_size=2,
- dataset_text_field="text",
max_seq_length=16,
report_to="none",
)
@@ -519,7 +496,6 @@ def test_sft_trainer_with_model(self):
max_steps=2,
save_steps=1,
per_device_train_batch_size=2,
- dataset_text_field="text",
max_seq_length=16,
num_of_sequences=16,
packing=True,
@@ -594,7 +570,6 @@ def test_sft_trainer_with_model(self):
max_steps=2,
save_steps=1,
per_device_train_batch_size=2,
- dataset_text_field="text",
max_seq_length=16,
report_to="none",
)
@@ -1159,7 +1134,6 @@ def test_sft_trainer_skip_prepare_dataset(self):
per_device_train_batch_size=2,
gradient_checkpointing=True,
remove_unused_columns=False,
- dataset_text_field="text", # need a dummy field
dataset_kwargs={"skip_prepare_dataset": True},
report_to="none",
)
@@ -1209,7 +1183,6 @@ def test_sft_trainer_llava(self):
per_device_train_batch_size=2,
per_device_eval_batch_size=2,
remove_unused_columns=False,
- dataset_text_field="", # need a dummy field
dataset_kwargs={"skip_prepare_dataset": True},
report_to="none",
)
4 changes: 2 additions & 2 deletions trl/trainer/sft_config.py
@@ -27,7 +27,7 @@ class SFTConfig(TrainingArguments):
command line.

Parameters:
- dataset_text_field (`Optional[str]`, *optional*, defaults to `None`):
+ dataset_text_field (`str`, *optional*, defaults to `"text"`):
Name of the text field of the dataset. If provided, the trainer will automatically create a
[`ConstantLengthDataset`] based on `dataset_text_field`.
packing (`bool`, *optional*, defaults to `False`):
@@ -56,7 +56,7 @@
Monkey patch the model with Liger kernels to increase throughput and reduce memory usage.
"""

- dataset_text_field: Optional[str] = None
+ dataset_text_field: str = "text"
packing: bool = False
max_seq_length: Optional[int] = None
dataset_num_proc: Optional[int] = None
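With the new default in place, `dataset_text_field` only needs to be set when a dataset keeps its text under a different column name. A small sketch of both cases (the `"content"` column name is hypothetical, used purely for illustration):

```python
from trl import SFTConfig

# Datasets with a "text" column need no extra configuration.
default_args = SFTConfig(output_dir="/tmp")
print(default_args.dataset_text_field)  # -> "text"

# Hypothetical dataset whose text lives in a "content" column: override the default.
custom_args = SFTConfig(output_dir="/tmp", dataset_text_field="content")
print(custom_args.dataset_text_field)  # -> "content"
```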