Address reviewer feedback on ORPO experimental migration

behroozazarkhalili · behroozazarkhalili · commit 18040a403757 · 2025-11-05T18:36:39.000-08:00
- Restore ORPO imports in trl/trainer/__init__.py for backward compatibility
- Fix deprecation stub naming from ExperimentalORPOTrainer to _ORPOTrainer
- Add torch import to deprecation stub for type hints
- Fix relative import paths in trl/experimental/orpo/orpo_trainer.py
- Update autodoc references to experimental.orpo.ORPOTrainer
- Update all documentation references to use experimental namespace
- Move ORPO test from test_trainers_args.py to experimental/test_trainers_args.py
diff --git a/docs/source/community_tutorials.md b/docs/source/community_tutorials.md
@@ -15,7 +15,7 @@ Community tutorials are made by active members of the Hugging Face community who
 | Instruction tuning | [`SFTTrainer`] | Fine-tuning Google Gemma LLMs using ChatML format with QLoRA | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/fine-tune-google-gemma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/gemma-lora-example.ipynb) |
 | Structured Generation | [`SFTTrainer`] | Fine-tuning Llama-2-7B to generate Persian product catalogs in JSON using QLoRA and PEFT | [Mohammadreza Esmaeilian](https://huggingface.co/Mohammadreza) | [Link](https://huggingface.co/learn/cookbook/en/fine_tuning_llm_to_generate_persian_product_catalogs_in_json_format) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_llm_to_generate_persian_product_catalogs_in_json_format.ipynb) |
 | Preference Optimization | [`DPOTrainer`] | Align Mistral-7b using Direct Preference Optimization for human preference alignment | [Maxime Labonne](https://huggingface.co/mlabonne) | [Link](https://mlabonne.github.io/blog/posts/Fine_tune_Mistral_7b_with_DPO.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mlabonne/llm-course/blob/main/Fine_tune_a_Mistral_7b_model_with_DPO.ipynb) |
-| Preference Optimization | [`ORPOTrainer`] | Fine-tuning Llama 3 with ORPO combining instruction tuning and preference alignment | [Maxime Labonne](https://huggingface.co/mlabonne) | [Link](https://mlabonne.github.io/blog/posts/2024-04-19_Fine_tune_Llama_3_with_ORPO.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eHNWg9gnaXErdAa8_mcvjMupbSS6rDvi) |
+| Preference Optimization | [`experimental.orpo.ORPOTrainer`] | Fine-tuning Llama 3 with ORPO combining instruction tuning and preference alignment | [Maxime Labonne](https://huggingface.co/mlabonne) | [Link](https://mlabonne.github.io/blog/posts/2024-04-19_Fine_tune_Llama_3_with_ORPO.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eHNWg9gnaXErdAa8_mcvjMupbSS6rDvi) |
 | Instruction tuning | [`SFTTrainer`] | How to fine-tune open LLMs in 2025 with Hugging Face | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/fine-tune-llms-in-2025) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/fine-tune-llms-in-2025.ipynb) |
 
 ### Videos
diff --git a/docs/source/dataset_formats.md b/docs/source/dataset_formats.md
@@ -395,7 +395,7 @@ Choosing the right dataset type depends on the task you are working on and the s
 | [`KTOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
 | [`NashMDTrainer`] | [Prompt-only](#prompt-only) |
 | [`OnlineDPOTrainer`] | [Prompt-only](#prompt-only) |
-| [`ORPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
+| [`experimental.orpo.ORPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
 | [`PPOTrainer`] | Tokenized language modeling |
 | [`PRMTrainer`] | [Stepwise supervision](#stepwise-supervision) |
 | [`RewardTrainer`] | [Preference (implicit prompt recommended)](#preference) |
diff --git a/docs/source/example_overview.md b/docs/source/example_overview.md
@@ -54,7 +54,7 @@ Scripts are maintained in the [`trl/scripts`](https://github.com/huggingface/trl
 | [`examples/scripts/nash_md.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/nash_md.py) | This script shows how to use the [`NashMDTrainer`] to fine-tune a model. |
 | [`examples/scripts/online_dpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/online_dpo.py) | This script shows how to use the [`OnlineDPOTrainer`] to fine-tune a model. |
 | [`examples/scripts/online_dpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/online_dpo_vlm.py) | This script shows how to use the [`OnlineDPOTrainer`] to fine-tune a a Vision Language Model. |
-| [`examples/scripts/orpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/orpo.py) | This script shows how to use the [`ORPOTrainer`] to fine-tune a model to increase helpfulness and harmlessness using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset. |
+| [`examples/scripts/orpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/orpo.py) | This script shows how to use the [`experimental.orpo.ORPOTrainer`] to fine-tune a model to increase helpfulness and harmlessness using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset. |
 | [`examples/scripts/ppo/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo.py) | This script shows how to use the [`PPOTrainer`] to fine-tune a model to improve its ability to continue text with positive sentiment or physically descriptive language. |
 | [`examples/scripts/ppo/ppo_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo_tldr.py) | This script shows how to use the [`PPOTrainer`] to fine-tune a model to improve its ability to generate TL;DR summaries. |
 | [`examples/scripts/prm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/prm.py) | This script shows how to use the [`PRMTrainer`] to fine-tune a Process-supervised Reward Model (PRM). |
diff --git a/docs/source/index.md b/docs/source/index.md
@@ -41,8 +41,8 @@ Below is the current list of TRL trainers, organized by method type (⚡️ = vL
 
 - [`SFTTrainer`]
 - [`DPOTrainer`]
-- [`ORPOTrainer`]
 - [`experimental.bco.BCOTrainer`] 🧪
+- [`experimental.orpo.ORPOTrainer`] 🧪
 - [`CPOTrainer`]
 - [`KTOTrainer`]
 
diff --git a/docs/source/orpo_trainer.md b/docs/source/orpo_trainer.md
@@ -79,9 +79,9 @@ Here are some other factors to consider when choosing a programming language for
 
 ## Expected dataset type
 
-ORPO requires a [preference dataset](dataset_formats#preference). The [`ORPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
+ORPO requires a [preference dataset](dataset_formats#preference). The [`experimental.orpo.ORPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
 
-Although the [`ORPOTrainer`] supports both explicit and implicit prompts, we recommend using explicit prompts. If provided with an implicit prompt dataset, the trainer will automatically extract the prompt from the `"chosen"` and `"rejected"` columns. For more information, refer to the [preference style](dataset_formats#preference) section.
+Although the [`experimental.orpo.ORPOTrainer`] supports both explicit and implicit prompts, we recommend using explicit prompts. If provided with an implicit prompt dataset, the trainer will automatically extract the prompt from the `"chosen"` and `"rejected"` columns. For more information, refer to the [preference style](dataset_formats#preference) section.
 
 ## Example script
 
@@ -121,11 +121,11 @@ While training and evaluating, we record the following reward metrics:
 
 ## ORPOTrainer
 
-[[autodoc]] ORPOTrainer
+[[autodoc]] experimental.orpo.ORPOTrainer
     - train
     - save_model
     - push_to_hub
 
 ## ORPOConfig
 
-[[autodoc]] ORPOConfig
+[[autodoc]] experimental.orpo.ORPOConfig
diff --git a/tests/experimental/test_orpo_trainer.py b/tests/experimental/test_orpo_trainer.py
diff --git a/tests/experimental/test_trainers_args.py b/tests/experimental/test_trainers_args.py
@@ -16,6 +16,7 @@
 from transformers import AutoTokenizer
 
 from trl.experimental.bco import BCOConfig, BCOTrainer
+from trl.experimental.orpo import ORPOConfig, ORPOTrainer
 
 from ..testing_utils import TrlTestCase, require_sklearn
 
@@ -68,3 +69,30 @@ def test_bco(self):
         assert trainer.args.prompt_sample_size == 512
         assert trainer.args.min_density_ratio == 0.2
         assert trainer.args.max_density_ratio == 20.0
+
+    def test_orpo(self):
+        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
+        training_args = ORPOConfig(
+            self.tmp_dir,
+            max_length=256,
+            max_prompt_length=64,
+            max_completion_length=64,
+            beta=0.5,
+            disable_dropout=False,
+            label_pad_token_id=-99,
+            padding_value=-99,
+            truncation_mode="keep_start",
+            # generate_during_eval=True, # ignore this one, it requires wandb
+            is_encoder_decoder=True,
+            model_init_kwargs={"trust_remote_code": True},
+            dataset_num_proc=4,
+        )
+        trainer = ORPOTrainer(model=model_id, args=training_args, train_dataset=dataset, processing_class=tokenizer)
+        assert trainer.args.max_length == 256
+        assert trainer.args.max_prompt_length == 64
+        assert trainer.args.max_completion_length == 64
+        assert trainer.args.beta == 0.5
+        assert not trainer.args.disable_dropout
+        assert trainer.args.label_pad_token_id == -99
diff --git a/tests/test_trainers_args.py b/tests/test_trainers_args.py
@@ -28,8 +28,6 @@
     NashMDTrainer,
     OnlineDPOConfig,
     OnlineDPOTrainer,
-    ORPOConfig,
-    ORPOTrainer,
     RewardConfig,
     RewardTrainer,
     SFTConfig,
@@ -248,33 +246,6 @@ def test_online_dpo(self, beta_list):
         assert trainer.args.beta == (0.6 if not beta_list else [0.6, 0.7])
         assert trainer.args.loss_type == "hinge"
 
-    def test_orpo(self):
-        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
-        training_args = ORPOConfig(
-            self.tmp_dir,
-            max_length=256,
-            max_prompt_length=64,
-            max_completion_length=64,
-            beta=0.5,
-            disable_dropout=False,
-            label_pad_token_id=-99,
-            padding_value=-99,
-            truncation_mode="keep_start",
-            # generate_during_eval=True, # ignore this one, it requires wandb
-            is_encoder_decoder=True,
-            model_init_kwargs={"trust_remote_code": True},
-            dataset_num_proc=4,
-        )
-        trainer = ORPOTrainer(model=model_id, args=training_args, train_dataset=dataset, processing_class=tokenizer)
-        assert trainer.args.max_length == 256
-        assert trainer.args.max_prompt_length == 64
-        assert trainer.args.max_completion_length == 64
-        assert trainer.args.beta == 0.5
-        assert not trainer.args.disable_dropout
-        assert trainer.args.label_pad_token_id == -99
-
     def test_reward(self):
         model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
         tokenizer = AutoTokenizer.from_pretrained(model_id)
diff --git a/trl/experimental/orpo/orpo_trainer.py b/trl/experimental/orpo/orpo_trainer.py
@@ -49,9 +49,9 @@
 from transformers.utils import is_peft_available, is_torch_fx_proxy
 
 from ...data_utils import maybe_apply_chat_template, maybe_extract_prompt
-from ..base_trainer import BaseTrainer
+from ...trainer.base_trainer import BaseTrainer
 from .orpo_config import ORPOConfig
-from ..utils import (
+from ...trainer.utils import (
     DPODataCollatorWithPadding,
     add_bos_token_if_needed,
     add_eos_token_if_needed,
diff --git a/trl/trainer/__init__.py b/trl/trainer/__init__.py
@@ -54,6 +54,8 @@
     "nash_md_trainer": ["NashMDTrainer"],
     "online_dpo_config": ["OnlineDPOConfig"],
     "online_dpo_trainer": ["OnlineDPOTrainer"],
+    "orpo_config": ["ORPOConfig"],
+    "orpo_trainer": ["ORPOTrainer"],
     "ppo_config": ["PPOConfig"],
     "ppo_trainer": ["PPOTrainer"],
     "prm_config": ["PRMConfig"],
@@ -112,6 +114,8 @@
     from .nash_md_trainer import NashMDTrainer
     from .online_dpo_config import OnlineDPOConfig
     from .online_dpo_trainer import OnlineDPOTrainer
+    from .orpo_config import ORPOConfig
+    from .orpo_trainer import ORPOTrainer
     from .ppo_config import PPOConfig
     from .ppo_trainer import PPOTrainer
     from .prm_config import PRMConfig
diff --git a/trl/trainer/orpo_trainer.py b/trl/trainer/orpo_trainer.py
@@ -16,6 +16,7 @@
 from collections.abc import Callable
 from typing import Any
 
+import torch
 import torch.nn as nn
 from datasets import Dataset
 from transformers import (
@@ -29,11 +30,11 @@
 from transformers.trainer_callback import TrainerCallback
 from transformers.trainer_utils import EvalLoopOutput
 
-from ..experimental.orpo import ORPOTrainer as ExperimentalORPOTrainer
+from ..experimental.orpo import ORPOTrainer as _ORPOTrainer
 from .orpo_config import ORPOConfig
 
 
-class ORPOTrainer(ExperimentalORPOTrainer):
+class ORPOTrainer(_ORPOTrainer):
     """
     Initialize ORPOTrainer.