
Commit 64cfca4

Move judges to experimental submodule (#4439)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
1 parent 97ca1a2 commit 64cfca4

File tree: 20 files changed (+625, -491 lines)


docs/source/_toctree.yml

Lines changed: 2 additions & 2 deletions

@@ -87,8 +87,6 @@
     title: Model Classes
   - local: model_utils
     title: Model Utilities
-  - local: judges
-    title: Judges
   - local: callbacks
     title: Callbacks
   - local: data_utils
@@ -115,6 +113,8 @@
     title: GRPO With Replay Buffer
   - local: gspo_token
     title: GSPO-token
+  - local: judges
+    title: Judges
   - local: papo_trainer
     title: PAPO
   - local: xpo_trainer

docs/source/example_overview.md

Lines changed: 1 addition & 1 deletion

@@ -43,7 +43,7 @@ Scripts are maintained in the [`trl/scripts`](https://github.com/huggingface/trl
 | [`examples/scripts/cpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/cpo.py) | This script shows how to use the [`CPOTrainer`] to fine-tune a model to increase helpfulness and harmlessness using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset. |
 | [`trl/scripts/dpo.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/dpo.py) | This script shows how to use the [`DPOTrainer`] to fine-tune a model. |
 | [`examples/scripts/dpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo_vlm.py) | This script shows how to use the [`DPOTrainer`] to fine-tune a Vision Language Model to reduce hallucinations using the [openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset) dataset. |
-| [`examples/scripts/evals/judge_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/evals/judge_tldr.py) | This script shows how to use [`HfPairwiseJudge`] or [`OpenAIPairwiseJudge`] to judge model generations. |
+| [`examples/scripts/evals/judge_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/evals/judge_tldr.py) | This script shows how to use [`HfPairwiseJudge`] or [`experimental.judges.OpenAIPairwiseJudge`] to judge model generations. |
 | [`examples/scripts/gkd.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gkd.py) | This script shows how to use the [`GKDTrainer`] to fine-tune a model. |
 | [`trl/scripts/grpo.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/grpo.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a model. |
 | [`examples/scripts/grpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/grpo_vlm.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset. |

docs/source/judges.md

Lines changed: 13 additions & 13 deletions

@@ -1,7 +1,7 @@
 # Judges
 
 > [!WARNING]
-> TRL Judges is an experimental API which is subject to change at any time.
+> TRL Judges is an experimental API which is subject to change at any time. As of TRL v1.0, judges have been moved to the `trl.experimental.judges` module.
 
 TRL provides judges to easily compare two completions.
 
@@ -13,10 +13,10 @@ pip install trl[judges]
 
 ## Using the provided judges
 
-TRL provides several judges out of the box. For example, you can use the [`HfPairwiseJudge`] to compare two completions using a pre-trained model from the Hugging Face model hub:
+TRL provides several judges out of the box. For example, you can use the [`experimental.judges.HfPairwiseJudge`] to compare two completions using a pre-trained model from the Hugging Face model hub:
 
 ```python
-from trl import HfPairwiseJudge
+from trl.experimental.judges import HfPairwiseJudge
 
 judge = HfPairwiseJudge()
 judge.judge(
@@ -27,12 +27,12 @@ judge.judge(
 
 ## Define your own judge
 
-To define your own judge, we provide several base classes that you can subclass. For rank-based judges, you need to subclass [`BaseRankJudge`] and implement the [`BaseRankJudge.judge`] method. For pairwise judges, you need to subclass [`BasePairwiseJudge`] and implement the [`BasePairwiseJudge.judge`] method. If you want to define a judge that doesn't fit into these categories, you need to subclass [`BaseJudge`] and implement the [`BaseJudge.judge`] method.
+To define your own judge, we provide several base classes that you can subclass. For rank-based judges, you need to subclass [`experimental.judges.BaseRankJudge`] and implement the [`experimental.judges.BaseRankJudge.judge`] method. For pairwise judges, you need to subclass [`experimental.judges.BasePairwiseJudge`] and implement the [`experimental.judges.BasePairwiseJudge.judge`] method. If you want to define a judge that doesn't fit into these categories, you need to subclass [`experimental.judges.BaseJudge`] and implement the [`experimental.judges.BaseJudge.judge`] method.
 
 As an example, let's define a pairwise judge that prefers shorter completions:
 
 ```python
-from trl import BasePairwiseJudge
+from trl.experimental.judges import BasePairwiseJudge
 
 class PrefersShorterJudge(BasePairwiseJudge):
     def judge(self, prompts, completions, shuffle_order=False):
@@ -53,34 +53,34 @@ judge.judge(
 
 ### PairRMJudge
 
-[[autodoc]] PairRMJudge
+[[autodoc]] trl.experimental.judges.PairRMJudge
 
 ### HfPairwiseJudge
 
-[[autodoc]] HfPairwiseJudge
+[[autodoc]] trl.experimental.judges.HfPairwiseJudge
 
 ### OpenAIPairwiseJudge
 
-[[autodoc]] OpenAIPairwiseJudge
+[[autodoc]] trl.experimental.judges.OpenAIPairwiseJudge
 
 ### AllTrueJudge
 
-[[autodoc]] AllTrueJudge
+[[autodoc]] trl.experimental.judges.AllTrueJudge
 
 ## Base classes
 
 ### BaseJudge
 
-[[autodoc]] BaseJudge
+[[autodoc]] trl.experimental.judges.BaseJudge
 
 ### BaseBinaryJudge
 
-[[autodoc]] BaseBinaryJudge
+[[autodoc]] trl.experimental.judges.BaseBinaryJudge
 
 ### BaseRankJudge
 
-[[autodoc]] BaseRankJudge
+[[autodoc]] trl.experimental.judges.BaseRankJudge
 
 ### BasePairwiseJudge
 
-[[autodoc]] BasePairwiseJudge
+[[autodoc]] trl.experimental.judges.BasePairwiseJudge
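
For reference, a minimal, self-contained sketch of the relocated custom-judge example using the new import path. The judge body and the sample call are illustrative; only the import path, class name, and method signature come from the diff above, and the pairwise judge is assumed to return the index of the preferred completion for each prompt, as described in the judges documentation.

```python
from trl.experimental.judges import BasePairwiseJudge


class PrefersShorterJudge(BasePairwiseJudge):
    """Toy pairwise judge that always prefers the shorter of two completions."""

    def judge(self, prompts, completions, shuffle_order=False):
        # For each prompt, completions[i] is a pair; return the index of the preferred one.
        return [0 if len(pair[0]) <= len(pair[1]) else 1 for pair in completions]


judge = PrefersShorterJudge()
print(judge.judge(
    ["What is the capital of France?"],
    [["Paris is the capital of France and a major European city.", "Paris."]],
))  # -> [1]: the shorter completion wins
```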

docs/source/nash_md_trainer.md

Lines changed: 4 additions & 3 deletions

@@ -14,7 +14,7 @@ This post-training method was contributed by [Kashif Rasul](https://huggingface.
 
 ## Quick start
 
-This example demonstrates how to train a model using the Nash-MD method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and [`PairRMJudge`] as a judge. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
+This example demonstrates how to train a model using the Nash-MD method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and [`experimental.judges.PairRMJudge`] as a judge. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
 
 <iframe
   src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
@@ -28,7 +28,8 @@ Below is the script to train the model:
 ```python
 # train_nash_md.py
 from datasets import load_dataset
-from trl import NashMDConfig, NashMDTrainer, PairRMJudge
+from trl import NashMDConfig, NashMDTrainer
+from trl.experimental.judges import PairRMJudge
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
@@ -72,7 +73,7 @@ Nash-MD requires a [prompt-only dataset](dataset_formats#prompt-only). The [`Nas
 Instead of a judge, you can choose to use a reward model -- see [Reward Bench](https://huggingface.co/spaces/allenai/reward-bench) for a leaderboard of public models you can use. Below is a code example showing how to replace a judge with the [trl-lib/Qwen2-0.5B-Reward](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward) model:
 
 ```diff
-- from trl import PairRMJudge
+- from trl.experimental.judges import PairRMJudge
 + from transformers import AutoModelForSequenceClassification
 
 - judge = PairRMJudge()
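
The hunk above only shows the imports and model loading of the quick start. A minimal sketch of how the rest fits together with the new import path; the trainer keyword arguments (`judge`, `args`, `processing_class`, `train_dataset`), the `trl-lib/ultrafeedback-prompt` split, and the `output_dir` value are assumptions based on the surrounding docs, not shown in this diff.

```python
# Sketch only: argument names are assumed from the surrounding docs, not from this diff.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import NashMDConfig, NashMDTrainer
from trl.experimental.judges import PairRMJudge

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
judge = PairRMJudge()  # pairwise judge used to score on-policy completions
train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")

training_args = NashMDConfig(output_dir="Qwen2-0.5B-NashMD")  # output_dir is illustrative
trainer = NashMDTrainer(
    model=model,
    judge=judge,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
)
trainer.train()
```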

docs/source/online_dpo_trainer.md

Lines changed: 4 additions & 3 deletions

@@ -14,7 +14,7 @@ This post-training method was contributed by [Michael Noukhovitch](https://huggi
 
 ## Quick start
 
-This example demonstrates how to train a model using the online DPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and [`PairRMJudge`] as a judge. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
+This example demonstrates how to train a model using the online DPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and [`experimental.judges.PairRMJudge`] as a judge. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
 
 <iframe
   src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
@@ -28,7 +28,8 @@ Below is the script to train the model:
 ```python
 # train_online_dpo.py
 from datasets import load_dataset
-from trl import OnlineDPOConfig, OnlineDPOTrainer, PairRMJudge
+from trl import OnlineDPOConfig, OnlineDPOTrainer
+from trl.experimental.judges import PairRMJudge
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
@@ -74,7 +75,7 @@ Online DPO only requires a [prompt-only dataset](dataset_formats#prompt-only) (u
 Instead of a judge, you can choose to use a reward model -- see [Reward Bench](https://huggingface.co/spaces/allenai/reward-bench) for a leaderboard of public models you can use. Below is a code example showing how to replace a judge with the [trl-lib/Qwen2-0.5B-Reward](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward) model:
 
 ```diff
-- from trl import PairRMJudge
+- from trl.experimental.judges import PairRMJudge
 + from transformers import AutoModelForSequenceClassification
 
 - judge = PairRMJudge()
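
The inner `diff` block above stops at removing the judge. A hedged sketch of the replacement side using only standard `transformers` calls; the variable names are illustrative, and how the reward model is then passed to the trainer is left as a comment because the exact keyword is not shown in this hunk.

```python
# Sketch of swapping the judge for a reward model (variable names are illustrative).
from transformers import AutoModelForSequenceClassification, AutoTokenizer

reward_model = AutoModelForSequenceClassification.from_pretrained(
    "trl-lib/Qwen2-0.5B-Reward", num_labels=1
)
reward_tokenizer = AutoTokenizer.from_pretrained("trl-lib/Qwen2-0.5B-Reward")

# The trainer is then constructed with the reward model instead of `judge=...`;
# see the full online_dpo_trainer.md page for the exact keyword arguments.
```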

docs/source/xpo_trainer.md

Lines changed: 3 additions & 3 deletions

@@ -17,7 +17,7 @@ This post-training method was contributed by [Kashif Rasul](https://huggingface.
 
 ## Quick start
 
-This example demonstrates how to train a model using the XPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and [`PairRMJudge`] as a judge. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
+This example demonstrates how to train a model using the XPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and [`experimental.judges.PairRMJudge`] as a judge. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
 <iframe
   src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
   frameborder="0"
@@ -30,7 +30,7 @@ Below is the script to train the model:
 ```python
 # train_xpo.py
 from datasets import load_dataset
-from trl import PairRMJudge
+from trl.experimental.judges import PairRMJudge
 from trl.experimental.xpo import XPOConfig, XPOTrainer
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
@@ -75,7 +75,7 @@ XPO requires a [prompt-only dataset](dataset_formats#prompt-only). The [`experim
 Instead of a judge, you can choose to use a reward model -- see [Reward Bench](https://huggingface.co/spaces/allenai/reward-bench) for a leaderboard of public models you can use. Below is a code example showing how to replace a judge with the [trl-lib/Qwen2-0.5B-Reward](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward) model:
 
 ```diff
-- from trl import PairRMJudge
+- from trl.experimental.judges import PairRMJudge
 + from transformers import AutoModelForSequenceClassification
 
 - judge = PairRMJudge()
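
Because this commit moves the public import path, downstream scripts that need to run against both pre-move and post-move TRL releases can guard the import. A small sketch; the fallback branch mirrors the old `from trl import ...` path that this commit removes throughout.

```python
# Import judges from the new experimental location, falling back to the old
# top-level path for TRL releases that predate this change.
try:
    from trl.experimental.judges import HfPairwiseJudge, OpenAIPairwiseJudge, PairRMJudge
except ImportError:
    from trl import HfPairwiseJudge, OpenAIPairwiseJudge, PairRMJudge
```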

examples/scripts/evals/judge_tldr.py

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@
 from transformers import HfArgumentParser
 from vllm import LLM, SamplingParams
 
-from trl import HfPairwiseJudge, OpenAIPairwiseJudge
+from trl.experimental.judges import HfPairwiseJudge, OpenAIPairwiseJudge
 
 
 """

examples/scripts/nash_md.py

Lines changed: 1 addition & 3 deletions

@@ -61,18 +61,16 @@
 from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, GenerationConfig
 
 from trl import (
-    HfPairwiseJudge,
     LogCompletionsCallback,
     ModelConfig,
     NashMDConfig,
     NashMDTrainer,
-    OpenAIPairwiseJudge,
-    PairRMJudge,
     ScriptArguments,
     TrlParser,
     get_kbit_device_map,
     get_quantization_config,
 )
+from trl.experimental.judges import HfPairwiseJudge, OpenAIPairwiseJudge, PairRMJudge
 
 
 # Enable logging in a Hugging Face Space

examples/scripts/online_dpo.py

Lines changed: 1 addition & 3 deletions

@@ -56,19 +56,17 @@
 from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, GenerationConfig
 
 from trl import (
-    HfPairwiseJudge,
     LogCompletionsCallback,
     ModelConfig,
     OnlineDPOConfig,
     OnlineDPOTrainer,
-    OpenAIPairwiseJudge,
-    PairRMJudge,
     ScriptArguments,
     TrlParser,
     get_kbit_device_map,
     get_peft_config,
     get_quantization_config,
 )
+from trl.experimental.judges import HfPairwiseJudge, OpenAIPairwiseJudge, PairRMJudge
 
 
 # Enable logging in a Hugging Face Space

examples/scripts/xpo.py

Lines changed: 1 addition & 3 deletions

@@ -45,16 +45,14 @@
 from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, GenerationConfig
 
 from trl import (
-    HfPairwiseJudge,
     LogCompletionsCallback,
     ModelConfig,
-    OpenAIPairwiseJudge,
-    PairRMJudge,
     ScriptArguments,
     TrlParser,
     get_kbit_device_map,
     get_quantization_config,
 )
+from trl.experimental.judges import HfPairwiseJudge, OpenAIPairwiseJudge, PairRMJudge
 from trl.experimental.xpo import XPOConfig, XPOTrainer
 
 