huggingface
diff --git a/‎.github/workflows/docker-build.yml‎
Lines changed: 4 additions & 2 deletions b/‎.github/workflows/docker-build.yml‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎README.md‎
Lines changed: 5 additions & 8 deletions b/‎README.md‎
Lines changed: 5 additions & 8 deletions
diff --git a/‎docker/trl-dev/Dockerfile‎
Lines changed: 2 additions & 3 deletions b/‎docker/trl-dev/Dockerfile‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎docker/trl/Dockerfile‎
Lines changed: 3 additions & 3 deletions b/‎docker/trl/Dockerfile‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎docs/source/_toctree.yml‎
Lines changed: 18 additions & 12 deletions b/‎docs/source/_toctree.yml‎
Lines changed: 18 additions & 12 deletions
@@ -13,7 +13,8 @@ concurrency:
 jobs:
   trl:
     name: "Build and push TRL Docker image"
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -52,7 +53,8 @@ jobs:
 
   trl-dev:
     name: "Build and push TRL Dev Docker image"
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
 
@@ -21,11 +21,11 @@
 
 **OpenEnv Integration:** TRL now supports **[OpenEnv](https://huggingface.co/blog/openenv)**, the open-source framework from Meta for defining, deploying, and interacting with environments in reinforcement learning and agentic workflows.
 
-Explore how to seamlessly integrate TRL with OpenEnv in our [dedicated documentation](openenv).
+Explore how to seamlessly integrate TRL with OpenEnv in our [dedicated documentation](https://huggingface.co/docs/trl/openenv).
 
 ## Overview
 
-TRL is a cutting-edge library designed for post-training foundation models using advanced techniques like Supervised Fine-Tuning (SFT), Proximal Policy Optimization (PPO), and Direct Preference Optimization (DPO). Built on top of the [🤗 Transformers](https://github.com/huggingface/transformers) ecosystem, TRL supports a variety of model architectures and modalities, and can be scaled-up across various hardware setups.
+TRL is a cutting-edge library designed for post-training foundation models using advanced techniques like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), and Direct Preference Optimization (DPO). Built on top of the [🤗 Transformers](https://github.com/huggingface/transformers) ecosystem, TRL supports a variety of model architectures and modalities, and can be scaled-up across various hardware setups.
 
 ## Highlights
 
@@ -92,16 +92,13 @@ trainer.train()
 ```python
 from datasets import load_dataset
 from trl import GRPOTrainer
+from trl.rewards import accuracy_reward
 
-dataset = load_dataset("trl-lib/tldr", split="train")
-
-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")
 
 trainer = GRPOTrainer(
     model="Qwen/Qwen2-0.5B-Instruct",
-    reward_funcs=reward_num_unique_chars,
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
 trainer.train()
 
@@ -1,6 +1,5 @@
-FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-runtime
+FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
 RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 RUN pip install --upgrade pip uv
 RUN uv pip install --system --no-cache "git+https://github.com/huggingface/trl.git#egg=trl[liger,peft,vlm]"
-RUN uv pip install --system hf_transfer liger_kernel trackio peft
-RUN uv pip install --system https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
+RUN uv pip install --system kernels liger_kernel peft trackio
@@ -1,4 +1,4 @@
-FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-runtime
+FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 RUN pip install --upgrade pip uv
-RUN uv pip install --system trl[liger,peft,vlm] hf_transfer trackio
-RUN uv pip install --system https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
+RUN uv pip install --system trl[liger,peft,vlm] kernels trackio
@@ -56,22 +56,14 @@
   title: Examples
 - sections:
   - sections: # Sorted alphabetically
-    - local: cpo_trainer
-      title: CPO
     - local: dpo_trainer
       title: DPO
     - local: online_dpo_trainer
       title: Online DPO
-    - local: gkd_trainer
-      title: GKD
     - local: grpo_trainer
       title: GRPO
     - local: kto_trainer
       title: KTO
-    - local: nash_md_trainer
-      title: Nash-MD
-    - local: ppo_trainer
-      title: PPO
     - local: prm_trainer
       title: PRM
     - local: reward_trainer
@@ -80,15 +72,11 @@
       title: RLOO
     - local: sft_trainer
       title: SFT
-    - local: xpo_trainer
-      title: XPO
     title: Trainers
   - local: models
     title: Model Classes
   - local: model_utils
     title: Model Utilities
-  - local: judges
-    title: Judges
   - local: callbacks
     title: Callbacks
   - local: data_utils
@@ -107,14 +95,32 @@
     title: BEMA for Reference Model
   - local: bco_trainer
     title: BCO
+  - local: cpo_trainer
+    title: CPO
   - local: gfpo
     title: GFPO
+  - local: gkd_trainer
+    title: GKD
   - local: gold_trainer
     title: GOLD
   - local: grpo_with_replay_buffer
     title: GRPO With Replay Buffer
   - local: gspo_token
     title: GSPO-token
+  - local: judges
+    title: Judges
+  - local: minillm
+    title: MiniLLM
+  - local: nash_md_trainer
+    title: Nash-MD
+  - local: orpo_trainer
+    title: ORPO
+  - local: papo_trainer
+    title: PAPO
+  - local: ppo_trainer
+    title: PPO
+  - local: xpo_trainer
+    title: XPO
   - local: openenv
     title: OpenEnv Integration
   - local: orpo_trainer