From 660e973b788011fab6a14446c037e23b39f72371 Mon Sep 17 00:00:00 2001
From: Behrooz <ermiaazarkhalili@gmail.com>
Date: Sun, 2 Nov 2025 12:00:48 -0800
Subject: [PATCH] docs: Extend CLI basic usage examples to all supported CLIs

Resolves #4378

- Add GRPO CLI examples with trl-lib/ultrafeedback-prompt dataset
- Add RLOO CLI examples with AI-MO/NuminaMath-TIR dataset
- Add KTO CLI examples with trl-lib/kto-mix-14k dataset
- Add examples to every section: Basic Usage, Config Files, scaling with Accelerate (`--num_processes`), `--accelerate_config`, and dataset mixtures
- Ensure parity in documentation coverage across all 6 training CLIs
- Verified CLI commands exist in trl/cli.py (lines 47-51)
- Verified datasets match official examples in examples/scripts/
---
 docs/source/clis.md | 279 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 279 insertions(+)
diff --git a/docs/source/clis.md b/docs/source/clis.md
index 666584decf4..efe47e84f3c 100644
--- a/docs/source/clis.md
+++ b/docs/source/clis.md
@@ -53,6 +53,33 @@ trl reward \
   --dataset_name trl-lib/ultrafeedback_binarized
 ```
 
+</hfoption>
+<hfoption id="GRPO">
+
+```bash
+trl grpo \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name trl-lib/ultrafeedback-prompt
+```
+
+</hfoption>
+<hfoption id="RLOO">
+
+```bash
+trl rloo \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name AI-MO/NuminaMath-TIR
+```
+
+</hfoption>
+<hfoption id="KTO">
+
+```bash
+trl kto \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name trl-lib/kto-mix-14k
+```
+
 </hfoption>
 </hfoptions>
 
@@ -105,6 +132,51 @@ Launch with:
 trl reward --config reward_config.yaml
 ```
 
+</hfoption>
+<hfoption id="GRPO">
+
+```yaml
+# grpo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: trl-lib/ultrafeedback-prompt
+```
+
+Launch with:
+
+```bash
+trl grpo --config grpo_config.yaml
+```
+
+</hfoption>
+<hfoption id="RLOO">
+
+```yaml
+# rloo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: AI-MO/NuminaMath-TIR
+```
+
+Launch with:
+
+```bash
+trl rloo --config rloo_config.yaml
+```
+
+</hfoption>
+<hfoption id="KTO">
+
+```yaml
+# kto_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: trl-lib/kto-mix-14k
+```
+
+Launch with:
+
+```bash
+trl kto --config kto_config.yaml
+```
+
 </hfoption>
 </hfoptions>
 
@@ -192,6 +264,84 @@ Launch with:
 trl reward --config reward_config.yaml
 ```
 
+</hfoption>
+<hfoption id="GRPO inline">
+
+```bash
+trl grpo \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name trl-lib/ultrafeedback-prompt \
+  --num_processes 4
+```
+
+</hfoption>
+<hfoption id="GRPO w/ config file">
+
+```yaml
+# grpo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: trl-lib/ultrafeedback-prompt
+num_processes: 4
+```
+
+Launch with:
+
+```bash
+trl grpo --config grpo_config.yaml
+```
+
+</hfoption>
+<hfoption id="RLOO inline">
+
+```bash
+trl rloo \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name AI-MO/NuminaMath-TIR \
+  --num_processes 4
+```
+
+</hfoption>
+<hfoption id="RLOO w/ config file">
+
+```yaml
+# rloo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: AI-MO/NuminaMath-TIR
+num_processes: 4
+```
+
+Launch with:
+
+```bash
+trl rloo --config rloo_config.yaml
+```
+
+</hfoption>
+<hfoption id="KTO inline">
+
+```bash
+trl kto \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name trl-lib/kto-mix-14k \
+  --num_processes 4
+```
+
+</hfoption>
+<hfoption id="KTO w/ config file">
+
+```yaml
+# kto_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: trl-lib/kto-mix-14k
+num_processes: 4
+```
+
+Launch with:
+
+```bash
+trl kto --config kto_config.yaml
+```
+
 </hfoption>
 </hfoptions>
 
@@ -298,6 +448,84 @@ Launch with:
 trl reward --config reward_config.yaml
 ```
 
+</hfoption>
+<hfoption id="GRPO inline">
+
+```bash
+trl grpo \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name trl-lib/ultrafeedback-prompt \
+  --accelerate_config zero2  # or path/to/my/accelerate/config.yaml
+```
+
+</hfoption>
+<hfoption id="GRPO w/ config file">
+
+```yaml
+# grpo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: trl-lib/ultrafeedback-prompt
+accelerate_config: zero2  # or path/to/my/accelerate/config.yaml
+```
+
+Launch with:
+
+```bash
+trl grpo --config grpo_config.yaml
+```
+
+</hfoption>
+<hfoption id="RLOO inline">
+
+```bash
+trl rloo \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name AI-MO/NuminaMath-TIR \
+  --accelerate_config zero2  # or path/to/my/accelerate/config.yaml
+```
+
+</hfoption>
+<hfoption id="RLOO w/ config file">
+
+```yaml
+# rloo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: AI-MO/NuminaMath-TIR
+accelerate_config: zero2  # or path/to/my/accelerate/config.yaml
+```
+
+Launch with:
+
+```bash
+trl rloo --config rloo_config.yaml
+```
+
+</hfoption>
+<hfoption id="KTO inline">
+
+```bash
+trl kto \
+  --model_name_or_path Qwen/Qwen2.5-0.5B \
+  --dataset_name trl-lib/kto-mix-14k \
+  --accelerate_config zero2  # or path/to/my/accelerate/config.yaml
+```
+
+</hfoption>
+<hfoption id="KTO w/ config file">
+
+```yaml
+# kto_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+dataset_name: trl-lib/kto-mix-14k
+accelerate_config: zero2  # or path/to/my/accelerate/config.yaml
+```
+
+Launch with:
+
+```bash
+trl kto --config kto_config.yaml
+```
+
 </hfoption>
 </hfoptions>
 
@@ -356,6 +584,57 @@ Launch with:
 trl reward --config reward_config.yaml
 ```
 
+</hfoption>
+<hfoption id="GRPO">
+
+```yaml
+# grpo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+datasets:
+  - path: trl-lib/ultrafeedback-prompt
+  - path: BAAI/Infinity-Preference
+```
+
+Launch with:
+
+```bash
+trl grpo --config grpo_config.yaml
+```
+
+</hfoption>
+<hfoption id="RLOO">
+
+```yaml
+# rloo_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+datasets:
+  - path: AI-MO/NuminaMath-TIR
+  - path: deepmind/math_dataset
+```
+
+Launch with:
+
+```bash
+trl rloo --config rloo_config.yaml
+```
+
+</hfoption>
+<hfoption id="KTO">
+
+```yaml
+# kto_config.yaml
+model_name_or_path: Qwen/Qwen2.5-0.5B
+datasets:
+  - path: trl-lib/kto-mix-14k
+  - path: argilla/ultrafeedback-binarized-preferences-cleaned
+```
+
+Launch with:
+
+```bash
+trl kto --config kto_config.yaml
+```
+
 </hfoption>
 </hfoptions>