
Commit 7ff168e

docs: Address review feedback on PEFT integration guide
Applied all requested changes from the PR review:

1. Added a reference link to the example SFT LoRA/QLoRA notebook.
2. Implemented `<hfoptions>` tabs to organize the SFT/DPO/GRPO examples.
3. Simplified the Python code examples by removing non-PEFT boilerplate.

The documentation now focuses more clearly on PEFT-specific configuration while maintaining all essential information.
1 parent a0a1c96 commit 7ff168e

File tree: 1 file changed (+25, -80 lines)

docs/source/peft_integration.md

Lines changed: 25 additions & 80 deletions
@@ -4,6 +4,8 @@ TRL supports [PEFT](https://github.com/huggingface/peft) (Parameter-Efficient Fi

This guide covers how to use PEFT with different TRL trainers, including LoRA, QLoRA, and prompt tuning techniques.

+For a complete working example, see the [SFT with LoRA/QLoRA notebook](https://github.com/huggingface/trl/blob/main/examples/notebooks/sft_trl_lora_qlora.ipynb).
+
## Installation

To use PEFT with TRL, install the required dependencies:
@@ -60,6 +62,9 @@ trainer = SFTTrainer(

TRL's trainers support PEFT configurations for various training paradigms. Below are detailed examples for each major trainer.

+<hfoptions id="trainer-type">
+<hfoption id="sft">
+
### Supervised Fine-Tuning (SFT)

The `SFTTrainer` is used for supervised fine-tuning on instruction datasets.
@@ -96,18 +101,9 @@ python trl/scripts/sft.py \
#### Python Example

```python
-from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer

-# Load model and tokenizer
-model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")
-tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
-
-# Load dataset
-dataset = load_dataset("trl-lib/Capybara", split="train")
-
# Configure LoRA
peft_config = LoraConfig(
    r=32,
@@ -118,26 +114,20 @@ peft_config = LoraConfig(
    target_modules=["q_proj", "v_proj"],  # Optional: specify target modules
)

-# Training arguments
-training_args = SFTConfig(
-    output_dir="./Qwen2-0.5B-SFT-LoRA",
-    learning_rate=2.0e-4,
-    per_device_train_batch_size=2,
-    num_train_epochs=1,
-)
-
-# Create trainer
+# Create trainer with PEFT config
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
-    peft_config=peft_config,
+    peft_config=peft_config,  # Pass PEFT config here
)

-# Train
trainer.train()
```

+</hfoption>
+<hfoption id="dpo">
+
### Direct Preference Optimization (DPO)

The `DPOTrainer` implements preference learning from human feedback.
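The simplified SFT snippet above still refers to `model`, `dataset`, and `training_args`, which the guide now defines elsewhere. As a point of reference, a minimal self-contained sketch can be reassembled from the boilerplate this commit removes (model, dataset, and hyperparameter values are simply the ones deleted above, not tuned recommendations):

```python
# Reassembled SFT + LoRA sketch: the PEFT-specific lines kept in the docs plus
# the setup removed in this commit. Values are illustrative only.
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer

# Base model and tokenizer (from the removed lines); the tokenizer is loaded
# for completeness and is not passed to the trainer below.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")

# Instruction dataset used by the original example
dataset = load_dataset("trl-lib/Capybara", split="train")

# LoRA adapter configuration as kept in the docs; task_type is an assumption
# carried over from the DPO/GRPO examples in the same diff.
peft_config = LoraConfig(
    r=32,
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)

# Training arguments removed from the docs but still needed to run the example
training_args = SFTConfig(
    output_dir="./Qwen2-0.5B-SFT-LoRA",
    learning_rate=2.0e-4,
    per_device_train_batch_size=2,
    num_train_epochs=1,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    peft_config=peft_config,
)
trainer.train()
```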
@@ -172,18 +162,9 @@ python trl/scripts/dpo.py \
#### Python Example

```python
-from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig
from trl import DPOConfig, DPOTrainer

-# Load model and tokenizer
-model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
-tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
-
-# Load dataset
-dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
-
# Configure LoRA
peft_config = LoraConfig(
    r=32,
@@ -193,29 +174,23 @@ peft_config = LoraConfig(
    task_type="CAUSAL_LM",
)

-# Training arguments
-training_args = DPOConfig(
-    output_dir="./Qwen2-0.5B-DPO-LoRA",
-    learning_rate=5.0e-6,
-    per_device_train_batch_size=2,
-)
-
-# Create trainer
-# When using PEFT, ref_model is automatically handled and set to None
+# Create trainer with PEFT config
trainer = DPOTrainer(
    model=model,
    ref_model=None,  # Not needed when using PEFT
    args=training_args,
    train_dataset=dataset,
-    peft_config=peft_config,
+    peft_config=peft_config,  # Pass PEFT config here
)

-# Train
trainer.train()
```

**Note:** When using PEFT with DPO, you don't need to provide a separate reference model (`ref_model`). The trainer automatically uses the frozen base model as the reference.

+</hfoption>
+<hfoption id="grpo">
+
### Group Relative Policy Optimization (GRPO)

The `GRPOTrainer` optimizes policies using group-based rewards.
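Likewise, the trimmed DPO example assumes `model`, `dataset`, and `training_args` are already defined. A minimal sketch of that setup, using only names and values taken from the lines removed above:

```python
# Setup assumed by the simplified DPO example; values come from the removed
# boilerplate and are illustrative only.
from datasets import load_dataset
from transformers import AutoModelForCausalLM
from trl import DPOConfig

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

training_args = DPOConfig(
    output_dir="./Qwen2-0.5B-DPO-LoRA",
    learning_rate=5.0e-6,
    per_device_train_batch_size=2,
)
```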
@@ -248,14 +223,9 @@ python trl/scripts/grpo.py \
#### Python Example

```python
-from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig
from trl import GRPOConfig, GRPOTrainer

-# Load dataset
-dataset = load_dataset("trl-lib/math-reasoning", split="train")
-
# Configure LoRA
peft_config = LoraConfig(
    r=32,
@@ -265,25 +235,20 @@ peft_config = LoraConfig(
    task_type="CAUSAL_LM",
)

-# Training arguments
-training_args = GRPOConfig(
-    output_dir="./Qwen2-0.5B-GRPO-LoRA",
-    learning_rate=1.0e-5,
-    per_device_train_batch_size=2,
-)
-
-# Create trainer
+# Create trainer with PEFT config
trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B",  # Can pass model name or loaded model
    args=training_args,
    train_dataset=dataset,
-    peft_config=peft_config,
+    peft_config=peft_config,  # Pass PEFT config here
)

-# Train
trainer.train()
```

+</hfoption>
+</hfoptions>
+
## QLoRA: Quantized Low-Rank Adaptation

QLoRA combines 4-bit quantization with LoRA to enable fine-tuning of very large models on consumer hardware. This technique can reduce memory requirements by up to 4x compared to standard LoRA.
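The GRPO example follows the same pattern: `dataset` and `training_args` come from the removed boilerplate. A sketch of that setup (note that `GRPOTrainer` also expects reward functions via its `reward_funcs` argument, which this diff does not touch):

```python
# Setup assumed by the simplified GRPO example; names and values are taken
# from the removed lines. Reward functions are not shown in this diff.
from datasets import load_dataset
from trl import GRPOConfig

dataset = load_dataset("trl-lib/math-reasoning", split="train")

training_args = GRPOConfig(
    output_dir="./Qwen2-0.5B-GRPO-LoRA",
    learning_rate=1.0e-5,
    per_device_train_batch_size=2,
)
```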
@@ -330,7 +295,7 @@ bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
)

-# Load model in 4-bit
+# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
@@ -346,15 +311,7 @@ peft_config = LoraConfig(
    task_type="CAUSAL_LM",
)

-# Training arguments
-training_args = SFTConfig(
-    output_dir="./Llama-2-7b-QLoRA",
-    per_device_train_batch_size=1,
-    gradient_accumulation_steps=16,
-    learning_rate=2.0e-4,
-)
-
-# Create trainer
+# Create trainer with PEFT config
trainer = SFTTrainer(
    model=model,
    args=training_args,
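The QLoRA example keeps the quantized model load and the LoRA config but drops its `SFTConfig`. For reference, a sketch of the removed training arguments (small per-device batch with gradient accumulation, as typically used for a 7B model in 4-bit):

```python
# SFTConfig removed from the QLoRA example; these are the previously
# documented values, not tuned recommendations.
from trl import SFTConfig

training_args = SFTConfig(
    output_dir="./Llama-2-7b-QLoRA",
    per_device_train_batch_size=1,    # small batch to fit a 7B model in 4-bit
    gradient_accumulation_steps=16,   # recover a larger effective batch size
    learning_rate=2.0e-4,
)
```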
@@ -427,13 +384,9 @@ Prompt tuning is another PEFT technique that learns soft prompts (continuous emb
### Using Prompt Tuning with TRL

```python
-from transformers import AutoModelForCausalLM
-from peft import PromptTuningConfig, PromptTuningInit, get_peft_model, TaskType
+from peft import PromptTuningConfig, PromptTuningInit, TaskType
from trl import SFTConfig, SFTTrainer

-# Load base model
-model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")
-
# Configure Prompt Tuning
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
@@ -443,20 +396,12 @@ peft_config = PromptTuningConfig(
    tokenizer_name_or_path="Qwen/Qwen2-0.5B",
)

-# Training arguments
-training_args = SFTConfig(
-    output_dir="./Qwen2-0.5B-PromptTuning",
-    per_device_train_batch_size=8,
-    learning_rate=3e-2,  # Prompt tuning typically uses higher learning rates
-    num_train_epochs=5,
-)
-
-# Create trainer
+# Create trainer with PEFT config
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
-    peft_config=peft_config,
+    peft_config=peft_config,  # Pass PEFT config here
)

trainer.train()
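Finally, the prompt-tuning snippet also assumes `model`, `dataset`, and `training_args`. A sketch reassembled from the removed lines; the dataset is an assumption, since none of the hunks show which dataset this section uses:

```python
# Setup assumed by the prompt-tuning example. Model and SFTConfig values come
# from the removed lines; the dataset choice is an assumption.
from datasets import load_dataset
from transformers import AutoModelForCausalLM
from trl import SFTConfig

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")
dataset = load_dataset("trl-lib/Capybara", split="train")  # assumed dataset

training_args = SFTConfig(
    output_dir="./Qwen2-0.5B-PromptTuning",
    per_device_train_batch_size=8,
    learning_rate=3e-2,   # prompt tuning typically uses higher learning rates
    num_train_epochs=5,
)
```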
