Reorder documentation TOC to surface key trainer sections (#4565)

qgallouedec · web-flow · commit 6f3a4529971e · 2025-11-24T08:11:49.000-07:00
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
@@ -12,6 +12,22 @@
   - local: paper_index
     title: Paper Index
   title: Conceptual Guides
+- sections: # Sorted alphabetically, with Online DPO kept next to DPO
+  - local: dpo_trainer
+    title: DPO
+  - local: online_dpo_trainer
+    title: Online DPO
+  - local: grpo_trainer
+    title: GRPO
+  - local: kto_trainer
+    title: KTO
+  - local: reward_trainer
+    title: Reward
+  - local: rloo_trainer
+    title: RLOO
+  - local: sft_trainer
+    title: SFT
+  title: Trainers
 - sections:
   - local: clis
     title: Command Line Interface (CLI)
@@ -55,20 +71,6 @@
     title: LoRA Without Regret
   title: Examples
 - sections:
-  - sections: # Sorted alphabetically
-    - local: dpo_trainer
-      title: DPO
-    - local: grpo_trainer
-      title: GRPO
-    - local: kto_trainer
-      title: KTO
-    - local: reward_trainer
-      title: Reward
-    - local: rloo_trainer
-      title: RLOO
-    - local: sft_trainer
-      title: SFT
-    title: Trainers
   - local: models
     title: Model Classes
   - local: model_utils
@@ -105,7 +107,7 @@
     title: GSPO-token
   - local: judges
     title: Judges
-  - local: minillm
+  - local: minillm_trainer
     title: MiniLLM
   - local: nash_md_trainer
     title: Nash-MD
diff --git a/docs/source/index.md b/docs/source/index.md
@@ -22,34 +22,34 @@ Below is the current list of TRL trainers, organized by method type (⚡️ = vL
 
 ### Online methods
 
-- [`GRPOTrainer`] ⚡️
-- [`RLOOTrainer`] ⚡️
-- [`experimental.nash_md.NashMDTrainer`] 🧪 ⚡️
-- [`experimental.online_dpo.OnlineDPOTrainer`] 🧪 ⚡️
-- [`experimental.ppo.PPOTrainer`] 🧪
-- [`experimental.xpo.XPOTrainer`] 🧪 ⚡️
+- [`GRPOTrainer`](grpo_trainer) ⚡️
+- [`RLOOTrainer`](rloo_trainer) ⚡️
+- [`OnlineDPOTrainer`](online_dpo_trainer) 🧪 ⚡️
+- [`NashMDTrainer`](nash_md_trainer) 🧪 ⚡️
+- [`PPOTrainer`](ppo_trainer) 🧪
+- [`XPOTrainer`](xpo_trainer) 🧪 ⚡️
 
 ### Reward modeling
 
-- [`RewardTrainer`]
-- [`experimental.prm.PRMTrainer`] 🧪
+- [`RewardTrainer`](reward_trainer)
+- [`PRMTrainer`](prm_trainer) 🧪
 
 </div>
 <div style="flex: 1; min-width: 0;">
 
 ### Offline methods
 
-- [`SFTTrainer`]
-- [`DPOTrainer`]
-- [`KTOTrainer`]
-- [`experimental.bco.BCOTrainer`] 🧪
-- [`experimental.cpo.CPOTrainer`] 🧪
-- [`experimental.orpo.ORPOTrainer`] 🧪
+- [`SFTTrainer`](sft_trainer)
+- [`DPOTrainer`](dpo_trainer)
+- [`KTOTrainer`](kto_trainer)
+- [`BCOTrainer`](bco_trainer) 🧪
+- [`CPOTrainer`](cpo_trainer) 🧪
+- [`ORPOTrainer`](orpo_trainer) 🧪
 
 ### Knowledge distillation
 
-- [`experimental.gkd.GKDTrainer`] 🧪
-- [`experimental.minillm.MiniLLMTrainer`] 🧪
+- [`GKDTrainer`](gkd_trainer) 🧪
+- [`MiniLLMTrainer`](minillm_trainer) 🧪
 
 </div>
 </div>
diff --git a/docs/source/minillm_trainer.md b/docs/source/minillm_trainer.md