diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index fd3ab04d6ca..70009f37cac 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -95,8 +95,6 @@
title: Model Classes
- local: model_utils
title: Model Utilities
- - local: best_of_n
- title: Best of N Sampling
- local: judges
title: Judges
- local: callbacks
diff --git a/docs/source/best_of_n.md b/docs/source/best_of_n.md
deleted file mode 100644
index 9280a9e2008..00000000000
--- a/docs/source/best_of_n.md
+++ /dev/null
@@ -1,68 +0,0 @@
-# Best of N sampling: Alternative ways to get better model output without RL-based fine-tuning
-
-> [!WARNING]
-> Best-of-N sampling is deprecated and will be removed in TRL 0.25.0.
-
-The extras module provides the `best-of-n` sampler class as an alternative way of generating better model output.
-For a comparison of how it fares against RL-based fine-tuning, see the comparison example in the `examples` directory.
-
-## Usage
-
-To get started quickly, instantiate the class with a model, a length sampler, a tokenizer, and a callable that serves as a proxy reward pipeline and outputs reward scores for input queries.
-
-```python
-from transformers import pipeline, AutoTokenizer
-from trl import AutoModelForCausalLMWithValueHead
-from trl.core import LengthSampler
-from trl.extras import BestOfNSampler
-
-# `ref_model_name`, `reward_model` and `device` are assumed to be defined by the user
-model = AutoModelForCausalLMWithValueHead.from_pretrained(ref_model_name)
-reward_pipe = pipeline("sentiment-analysis", model=reward_model, device=device)
-tokenizer = AutoTokenizer.from_pretrained(ref_model_name)
-tokenizer.pad_token = tokenizer.eos_token
-output_length_sampler = LengthSampler(2, 6)  # samples an output length between 2 and 6 tokens
-
-# callable that takes a list of raw text and returns a list of corresponding reward scores
-def queries_to_scores(list_of_strings):
- return [output["score"] for output in reward_pipe(list_of_strings)]
-
-best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler)
-```
-
-Assuming you have a list or tensor of tokenized queries, you can generate better output by calling the `generate` method:
-
-```python
-best_of_n.generate(query_tensors, device=device, **gen_kwargs)
-```
-
-The default sample size is 4, but you can change it at initialization:
-
-```python
-best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, sample_size=8)
-```
-
-By default, the top-scored output is returned for each query, but you can return the top 2 (or more) by passing the `n_candidates` argument at initialization:
-
-```python
-best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, n_candidates=2)
-```
-
-Generation settings (like `temperature` and `pad_token_id`) can also be set at instance creation rather than when calling the `generate` method,
-by passing a [`~transformers.GenerationConfig`] from the `transformers` library at initialization:
-
-```python
-from transformers import GenerationConfig
-
-generation_config = GenerationConfig(min_length=-1, top_k=0.0, top_p=1.0, do_sample=True, pad_token_id=tokenizer.eos_token_id)
-
-best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, generation_config=generation_config)
-
-best_of_n.generate(query_tensors, device=device)
-```
-
-Furthermore, at initialization you can set the seed to control the repeatability of the generation process, as well as the number of samples to generate for each query.
-
-## BestOfNSampler
-
-[[autodoc]] BestOfNSampler
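Since `BestOfNSampler` is removed here without a drop-in replacement, the following is a purely illustrative migration sketch, not part of this diff: it reproduces best-of-n sampling with plain `generate` plus a reward pipeline. The model names, the sentiment-analysis reward pipeline, and the `best_of_n` helper are placeholder assumptions, not TRL API.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name = "your-model-name"            # placeholder
reward_model_name = "your-reward-model"   # placeholder

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
reward_pipe = pipeline("sentiment-analysis", model=reward_model_name)

def best_of_n(prompt: str, sample_size: int = 4, max_new_tokens: int = 32) -> str:
    """Sample `sample_size` completions and return the highest-scoring one."""
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        do_sample=True,
        top_k=0,
        top_p=1.0,
        max_new_tokens=max_new_tokens,
        num_return_sequences=sample_size,
        pad_token_id=tokenizer.eos_token_id,
    )
    completions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    scores = [out["score"] for out in reward_pipe(completions)]
    return completions[int(torch.tensor(scores).argmax())]
```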
diff --git a/tests/test_best_of_n_sampler.py b/tests/test_best_of_n_sampler.py
deleted file mode 100644
index d52538c71d0..00000000000
--- a/tests/test_best_of_n_sampler.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import torch
-from transformers import AutoTokenizer, GenerationConfig
-
-from trl import AutoModelForCausalLMWithValueHead
-from trl.core import LengthSampler
-from trl.extras import BestOfNSampler
-
-from .testing_utils import TrlTestCase
-
-
-def queries_to_scores(list_of_strings):
- return [torch.rand(1).item() for _ in list_of_strings]
-
-
-class TestBestOfNSampler(TrlTestCase):
- """
- Tests the BestOfNSampler class
- """
-
-    ref_model_name = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
-    model = AutoModelForCausalLMWithValueHead.from_pretrained(ref_model_name)
-    tokenizer = AutoTokenizer.from_pretrained(ref_model_name)
-    tokenizer.pad_token = tokenizer.eos_token
-    output_length_sampler = LengthSampler(2, 6)
-
- def test_different_input_types(self):
- r"""
- Tests if the different input types normalizer works
- """
-
- generation_config = GenerationConfig(
- min_length=-1,
- top_k=0.0,
- top_p=1.0,
- do_sample=True,
- pad_token_id=self.tokenizer.eos_token_id,
- )
-
- output_length_sampler = LengthSampler(2, 6)
-
- best_of_n = BestOfNSampler(
- self.model,
- self.tokenizer,
- queries_to_scores,
- length_sampler=output_length_sampler,
- generation_config=generation_config,
- )
-
- queries = ["hello world", "goodbye world"]
- tokenized_queries = [self.tokenizer.encode(query) for query in queries]
-
- various_queries_formats = [
- (tokenized_queries[0], 1),
- (tokenized_queries, 2),
- (torch.tensor(tokenized_queries[1]), 1),
- ([torch.tensor(query) for query in tokenized_queries], 2),
- ]
-
- for q, expected_length in various_queries_formats:
- results = best_of_n.generate(q)
- assert isinstance(results, list)
- assert len(results) == expected_length
-
- def test_different_sample_sizes_and_n_candidates_values(self):
- r"""
- Tests different sample sizes and n_candidates values
- """
- generation_config = GenerationConfig(
- min_length=-1,
- top_k=0.0,
- top_p=1.0,
- do_sample=True,
- pad_token_id=self.tokenizer.eos_token_id,
- )
-
- output_length_sampler = LengthSampler(6, 10)
-
- for sample_value, n_candidates_values, expected in [
- (4, 2, 2),
- (10, 3, 3),
- (6, 4, 4),
- ]:
- best_of_n = BestOfNSampler(
- self.model,
- self.tokenizer,
- queries_to_scores,
- length_sampler=output_length_sampler,
- generation_config=generation_config,
- sample_size=sample_value,
- n_candidates=n_candidates_values,
- )
-
- queries = ["hello world", "troll the world"]
- tokenized_queries = [self.tokenizer.encode(query) for query in queries]
- results = best_of_n.generate(tokenized_queries)
- for result in results:
- assert len(result) == expected
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 973d4105433..60d9b9dcefb 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -30,7 +30,6 @@
DataCollatorForChatML,
RepeatSampler,
batch_generation,
- decode_and_strip_padding,
entropy_from_logits,
flush_left,
flush_right,
@@ -170,21 +169,6 @@ def test_create_peft_config_use_peft_true(self):
assert getattr(peft_config, arg) == value
-class TestDecodeAndStripPadding(TrlTestCase):
- def setup_method(self):
- self.tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
-
- def test_example_with_padding(self):
- inputs = self.tokenizer(["Hello world", "Hello"], padding=True, return_tensors="pt")
- decoded = decode_and_strip_padding(inputs["input_ids"], self.tokenizer)
- assert decoded == ["Hello world", "Hello"]
-
- def test_example_without_padding(self):
- inputs = self.tokenizer(["Hello", "Hello"], padding=False, return_tensors="pt")
- decoded = decode_and_strip_padding(inputs["input_ids"], self.tokenizer)
- assert decoded == ["Hello", "Hello"]
-
-
class TestGenerateModelCard(TrlTestCase):
def test_full(self):
model_card = generate_model_card(
diff --git a/trl/__init__.py b/trl/__init__.py
index 6eabf828078..8babb49039e 100644
--- a/trl/__init__.py
+++ b/trl/__init__.py
@@ -53,7 +53,6 @@
"truncate_dataset",
"unpair_preference_dataset",
],
- "extras": ["BestOfNSampler"],
"models": [
"SUPPORTED_ARCHITECTURES",
"AutoModelForCausalLMWithValueHead",
@@ -133,7 +132,6 @@
truncate_dataset,
unpair_preference_dataset,
)
- from .extras import BestOfNSampler
from .models import (
SUPPORTED_ARCHITECTURES,
AutoModelForCausalLMWithValueHead,
diff --git a/trl/extras/__init__.py b/trl/extras/__init__.py
index fddcdf1af1a..a3170185781 100644
--- a/trl/extras/__init__.py
+++ b/trl/extras/__init__.py
@@ -11,19 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
-from typing import TYPE_CHECKING
-
-from ..import_utils import _LazyModule
-
-
-_import_structure = {
- "best_of_n_sampler": ["BestOfNSampler"],
-}
-
-if TYPE_CHECKING:
- from .best_of_n_sampler import BestOfNSampler
-else:
- import sys
-
- sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/trl/extras/best_of_n_sampler.py b/trl/extras/best_of_n_sampler.py
deleted file mode 100644
index f7505042404..00000000000
--- a/trl/extras/best_of_n_sampler.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-from typing import Any, Callable, Optional, Union
-
-import torch
-from transformers import GenerationConfig, PreTrainedTokenizer, PreTrainedTokenizerFast, set_seed
-
-from ..models import SUPPORTED_ARCHITECTURES, PreTrainedModelWrapper
-
-
-class BestOfNSampler:
- """
- Sampler for best-of-n generation.
-
- Args:
- model ([`PreTrainedModelWrapper`]):
- The pretrained model to use for generation.
- tokenizer ([`~transformers.PreTrainedTokenizer`] or [`~transformers.PreTrainedTokenizerFast`]):
- Tokenizer associated with the pretrained model.
- queries_to_scores (`Callable[[list[str]], list[float]]`):
- Callable that takes a list of generated texts and returns the associated reward scores.
- length_sampler (`Any`):
- Sampler used to sample the length of the generated text.
- sample_size (`int`, *optional*, defaults to `4`):
- Number of samples to generate for each query.
- seed (`int`, *optional*):
- Random seed used to control generation.
- n_candidates (`int`, *optional*, defaults to `1`):
- Number of candidates to return for each query.
- generation_config ([`~transformers.GenerationConfig`], *optional*):
- Generation config passed to the underlying model's `generate` method. See
- [`~transformers.GenerationConfig`] for more details.
-
-
-        `BestOfNSampler` is deprecated and will be removed in version 0.25.
-
- """
-
- warnings.warn("`BestOfNSampler` is deprecated and will be removed in TRL 0.25.", FutureWarning, stacklevel=2)
-
- def __init__(
- self,
- model: PreTrainedModelWrapper,
- tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
- queries_to_scores: Callable[[list[str]], list[float]],
- length_sampler: Any,
- sample_size: int = 4,
- seed: Optional[int] = None,
- n_candidates: int = 1,
- generation_config: Optional[GenerationConfig] = None,
- ) -> None:
- if seed is not None:
- set_seed(seed)
-
- if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
- raise ValueError(
- f"tokenizer must be a PreTrainedTokenizer or PreTrainedTokenizerFast, got {type(tokenizer)}"
- )
- if not isinstance(model, (SUPPORTED_ARCHITECTURES)):
- raise ValueError(
- f"model must be a PreTrainedModelWrapper, got {type(model)} - supported architectures are: {SUPPORTED_ARCHITECTURES}"
- )
-
- self.model = model
- self.tokenizer = tokenizer
-
- self.queries_to_scores = queries_to_scores
- self.length_sampler = length_sampler
- self.gen_config = generation_config
- self.sample_size = sample_size
- self.n_candidates = n_candidates
-
- def generate(
- self,
- tokenized_query: Union[list[int], torch.Tensor, list[torch.Tensor], list[list[int]]],
- skip_special_tokens: bool = True,
- device: Optional[Union[str, torch.device]] = None,
- **generation_kwargs,
- ) -> list[list[str]]:
- """
- Generate the best of n samples for input queries.
-
- Args:
- tokenized_query (`list[int]` or `torch.Tensor` or `list[torch.Tensor]` or `list[list[int]]`):
- Either a single tokenized query (a single tensor or a list of integers) or a batch of tokenized queries
- (a list of tensors or a list of lists of integers).
- skip_special_tokens (`bool`, *optional*, defaults to `True`):
- Whether to remove the special tokens from the output.
- device (`str` or `torch.device`, *optional*):
- The device on which the model will be loaded.
- **generation_kwargs:
- Additional keyword arguments passed along to the underlying model's `generate` method. This is used to
- override generation config.
-
- Returns:
- `list[list[str]]`: A list of lists of generated texts.
- """
- queries = None
-
- if isinstance(tokenized_query, torch.Tensor) and tokenized_query.ndim == 1:
- queries = tokenized_query.unsqueeze(0)
- elif isinstance(tokenized_query, list):
- element_type = type(tokenized_query[0])
- if element_type is int:
- queries = torch.tensor(tokenized_query).unsqueeze(0)
- elif element_type is torch.Tensor:
- queries = [tensor.reshape((1, -1)) for tensor in tokenized_query]
- else:
- queries = [torch.tensor(query).reshape((1, -1)) for query in tokenized_query]
-
- result = []
-
- for query in queries:
- queries = query.repeat((self.sample_size, 1))
- output = self.model.generate(
- queries.to(device),
- max_new_tokens=self.length_sampler(),
- generation_config=self.gen_config,
- **generation_kwargs,
- ).squeeze()
- output = self.tokenizer.batch_decode(output, skip_special_tokens=skip_special_tokens)
- scores = torch.tensor(self.queries_to_scores(output))
- output = [output[i] for i in scores.topk(self.n_candidates).indices]
- result.append(output)
-
- return result
diff --git a/trl/trainer/dpo_config.py b/trl/trainer/dpo_config.py
index b9e73a78ff4..740fdb6353a 100644
--- a/trl/trainer/dpo_config.py
+++ b/trl/trainer/dpo_config.py
@@ -219,7 +219,7 @@ class DPOConfig(TrainingArguments):
- This parameter is deprecated and will be removed in version 0.25.0. Use `pad_token` (`str`) instead.
+ This parameter is deprecated and will be removed in version 0.26.0. Use `pad_token` (`str`) instead.
"""
diff --git a/trl/trainer/nash_md_trainer.py b/trl/trainer/nash_md_trainer.py
index ad2c20be7a7..d8cc1ca8a2e 100644
--- a/trl/trainer/nash_md_trainer.py
+++ b/trl/trainer/nash_md_trainer.py
@@ -94,14 +94,6 @@ class NashMDTrainer(OnlineDPOTrainer):
The optimizer and scheduler to use for training.
preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
The function to use to preprocess the logits before computing the metrics.
-
-    reward_model:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `reward_funcs` instead.
-
"""
_tag_names = ["trl", "nash-md"]
@@ -139,8 +131,6 @@ def __init__(
callbacks: Optional[list[TrainerCallback]] = None,
optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
- # Deprecated parameters
- reward_model: Optional[Union[PreTrainedModel, nn.Module]] = None,
) -> None:
super().__init__(
model=model,
@@ -158,7 +148,6 @@ def __init__(
callbacks=callbacks,
optimizers=optimizers,
preprocess_logits_for_metrics=preprocess_logits_for_metrics,
- reward_model=reward_model,
)
self._mixture_coef = self.args.mixture_coef
diff --git a/trl/trainer/online_dpo_config.py b/trl/trainer/online_dpo_config.py
index 08ed1a6700d..2e28ec13e81 100644
--- a/trl/trainer/online_dpo_config.py
+++ b/trl/trainer/online_dpo_config.py
@@ -59,17 +59,6 @@ class may differ from those in [`~transformers.TrainingArguments`].
- `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
- `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.
-
-    dataset_num_proc (`int`, *optional*):
-        Number of processes to use for processing the dataset.
-
-        This parameter is deprecated and will be removed in version 0.25.0. Since OnlineDPO does not involve
-        dataset preparation, you can safely remove it.
-
disable_dropout (`bool`, *optional*, defaults to `True`):
Whether to disable dropout in the model and reference model.
@@ -380,36 +369,11 @@ class may differ from those in [`~transformers.TrainingArguments`].
},
)
- # Deprecated parameters
- dataset_num_proc: Optional[int] = field(
- default=None,
- metadata={"help": "Number of processes to use for processing the dataset."},
- )
- gpu_memory_utilization: Optional[float] = field(
- default=None,
- metadata={
- "help": "This parameter is deprecated and will be removed in version 0.25.0. Please use "
- "`vllm_gpu_memory_utilization` instead.",
- },
- )
-
def __post_init__(self):
self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
super().__post_init__()
- if self.dataset_num_proc is not None:
- warnings.warn(
- "The parameter `dataset_num_proc` is deprecated and will be removed in version 0.25.0. "
- "Since OnlineDPO does not involve dataset preparation, you can safely remove it.",
- )
- if self.gpu_memory_utilization is not None:
- warnings.warn(
- "The parameter `gpu_memory_utilization` is deprecated and will be removed in version 0.25.0. "
- "Please use `vllm_gpu_memory_utilization` instead.",
- )
- self.vllm_gpu_memory_utilization = self.gpu_memory_utilization
-
if hasattr(self.beta, "__len__") and len(self.beta) == 1:
self.beta = self.beta[0]
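With the backward-compatibility shim removed, configs that still pass the old names will no longer be accepted. A hedged sketch of the updated usage (values are placeholders):

```python
from trl import OnlineDPOConfig

# `dataset_num_proc` can simply be dropped; `gpu_memory_utilization` becomes
# `vllm_gpu_memory_utilization`. The 0.55 below is a placeholder value.
training_args = OnlineDPOConfig(
    output_dir="online-dpo-model",
    vllm_gpu_memory_utilization=0.55,
)
```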
diff --git a/trl/trainer/online_dpo_trainer.py b/trl/trainer/online_dpo_trainer.py
index 70c649bdf6e..a35955980aa 100644
--- a/trl/trainer/online_dpo_trainer.py
+++ b/trl/trainer/online_dpo_trainer.py
@@ -164,14 +164,6 @@ class OnlineDPOTrainer(BaseTrainer):
The optimizer and scheduler to use for training.
preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
The function to use to preprocess the logits before computing the metrics.
-
-    reward_model:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `reward_funcs` instead.
-
"""
_tag_names = ["trl", "online-dpo"]
@@ -206,9 +198,6 @@ def __init__(
callbacks: Optional[list[TrainerCallback]] = None,
optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
- # Deprecated parameters
- reward_model: Optional[Union[PreTrainedModel, nn.Module]] = None,
- reward_processing_class: Optional[PreTrainedTokenizerBase] = None,
) -> None:
if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
warnings.warn(
@@ -225,36 +214,6 @@ def __init__(
self.ref_model = ref_model
- # Handle deprecated parameters for backward compatibility
- if reward_model is not None:
- warnings.warn(
- "The `reward_model` parameter is deprecated and will be removed in version 0.25.0. "
- "Please use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`.",
- )
- # Convert old reward_model to new reward_funcs format
- if reward_funcs is None:
- reward_funcs = reward_model
- else:
- warnings.warn(
- "Both `reward_model` and `reward_funcs` are provided. Using `reward_funcs` and ignoring "
- "`reward_model`.",
- )
-
- if reward_processing_class is not None:
- warnings.warn(
- "The `reward_processing_class` parameter is deprecated and will be removed in version 0.25.0. "
- "Please use `reward_processing_classes` instead. For example, change "
- "`reward_processing_class=tokenizer` to `reward_processing_classes=tokenizer`.",
- )
- # Convert old reward_processing_class to new reward_processing_classes format
- if reward_processing_classes is None:
- reward_processing_classes = reward_processing_class
- else:
- warnings.warn(
- "Both `reward_processing_class` and `reward_processing_classes` are provided. Using "
- "`reward_processing_classes` and ignoring `reward_processing_class`.",
- )
-
# Validate reward configuration - must have exactly one of: judge, or reward_funcs
reward_configs = sum(x is not None for x in [judge, reward_funcs])
if reward_configs == 0:
@@ -329,16 +288,7 @@ def __init__(
self.reward_weights = None
if args.missing_eos_penalty is not None and reward_funcs is None and judge is None:
- # Check if this is the old reward_model case
- if reward_model is not None:
- logger.warning(
- "The `missing_eos_penalty` parameter is deprecated when used with the deprecated `reward_model` parameter. "
- "Please use `reward_funcs` instead of `reward_model` to continue using this feature.",
- FutureWarning,
- stacklevel=2,
- )
- else:
- raise ValueError("`missing_eos_penalty` is only supported when `reward_funcs` is provided.")
+ raise ValueError("`missing_eos_penalty` is only supported when `reward_funcs` is provided.")
if args is None:
raise ValueError("`args` must be provided.")
@@ -1330,7 +1280,7 @@ def training_step(
if is_conversational({"prompt": prompts[0]}):
completions = [[{"role": "assistant", "content": completion}] for completion in completions]
- # Get the reward from reward functions, judge, or deprecated reward_model
+ # Get the reward from reward functions or judge
if self.reward_funcs is not None:
# First create completion_ids_list for custom reward functions
completion_ids_list = [completion_ids[i].tolist() for i in range(completion_ids.shape[0])]
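A hedged migration sketch for callers of `OnlineDPOTrainer` (the same keywords disappear from its `NashMDTrainer` and `XPOTrainer` subclasses); all variables are assumed to be defined by the caller:

```python
from trl import OnlineDPOTrainer

# model, reward_model, reward_tokenizer, training_args, train_dataset and tokenizer are placeholders.
trainer = OnlineDPOTrainer(
    model=model,
    reward_funcs=reward_model,                   # was: reward_model=reward_model
    reward_processing_classes=reward_tokenizer,  # was: reward_processing_class=reward_tokenizer
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
)
```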
diff --git a/trl/trainer/rloo_config.py b/trl/trainer/rloo_config.py
index eb27ca1f9a7..335cc72fd09 100644
--- a/trl/trainer/rloo_config.py
+++ b/trl/trainer/rloo_config.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import warnings
from dataclasses import dataclass, field
from typing import Optional, Union
@@ -193,142 +192,6 @@ class RLOOConfig(TrainingArguments):
wandb_log_unique_prompts (`bool`, *optional*, defaults to `False`):
Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, all prompts
are logged.
-
-    > Deprecated parameters
-
-    rloo_k:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `num_generations` instead.
-
-    cliprange:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `epsilon` instead.
-
-    kl_coef:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `beta` instead.
-
-    exp_name:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `run_name` instead.
-
-    normalize_reward:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `normalize_advantages` instead.
-
-    num_ppo_epochs:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `num_iterations` instead.
-
-    num_mini_batches:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `steps_per_generation` instead.
-
-    total_episodes:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `max_steps` instead.
-
-    response_length:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `max_completion_length` instead.
-
-    token_level_kl:
-        This parameter is deprecated and will be removed in version 0.25.0. KL is now computed only at the sequence
-        level.
-
-    dataset_num_proc:
-        This parameter is deprecated and will be removed in version 0.25.0. This parameter was unused, you can
-        safely remove it from your scripts.
-
-    local_rollout_forward_batch_size:
-        This parameter is deprecated and will be removed in version 0.25.0. Now it is automatically set to
-        `per_device_train_batch_size` (or `per_device_eval_batch_size` during evaluation).
-
-    num_sample_generations:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `logging_steps` to control
-        generation logging frequency.
-
-    stop_token:
-        This parameter is deprecated and will be removed in version 0.25.0.
-
-    stop_token_id:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `processing_class.eos_token_id`
-        instead.
-
-    missing_eos_penalty:
-        This parameter is deprecated and will be removed in version 0.25.0. Replicate with a custom reward function
-        checking if `eos_token_id` is in `completion_ids`.
-
"""
_VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
@@ -656,125 +519,9 @@ class RLOOConfig(TrainingArguments):
},
)
- # Deprecated params
- rloo_k: Optional[int] = field(
- default=None,
- metadata={"help": "Deprecated: use `num_generations` instead."},
- )
- cliprange: Optional[float] = field(
- default=None,
- metadata={"help": "Deprecated: use `epsilon` instead."},
- )
- kl_coef: Optional[float] = field(
- default=None,
- metadata={"help": "Deprecated: use `beta` instead."},
- )
- exp_name: Optional[str] = field(
- default=None,
- metadata={"help": "Deprecated: use `run_name` instead."},
- )
- normalize_reward: Optional[bool] = field(
- default=None,
- metadata={"help": "Deprecated: use `normalize_advantages` instead."},
- )
- num_ppo_epochs: Optional[int] = field(
- default=None,
- metadata={"help": "Deprecated: use `num_iterations` instead."},
- )
- num_mini_batches: Optional[int] = field(
- default=None,
- metadata={"help": "Deprecated: use `steps_per_generation` instead."},
- )
- total_episodes: Optional[int] = field(
- default=None,
- metadata={"help": "Deprecated: use `max_steps=total_episodes/(gradient_accumulation_steps*rloo_k)` instead."},
- )
- response_length: Optional[int] = field(
- default=None,
- metadata={"help": "Deprecated: use `max_completion_length` instead."},
- )
- token_level_kl: Optional[bool] = field(
- default=None,
- metadata={"help": "Removed: KL is now computed only at the sequence level."},
- )
- dataset_num_proc: Optional[int] = field(
- default=None,
- metadata={"help": "Removed: this parameter was unused, you can safely remove it from your scripts."},
- )
- local_rollout_forward_batch_size: Optional[int] = field(
- default=None,
- metadata={
- "help": "Removed: now automatically set to `per_device_train_batch_size` (or `per_device_eval_batch_size` "
- "during evaluation)."
- },
- )
- num_sample_generations: Optional[int] = field(
- default=None,
- metadata={"help": "Removed: use `logging_steps` to control generation logging frequency."},
- )
- stop_token: Optional[str] = field(
- default=None,
- metadata={"help": "Removed."},
- )
- stop_token_id: Optional[int] = field(
- default=None,
- metadata={"help": "Removed: use `processing_class.eos_token_id` instead."},
- )
- missing_eos_penalty: Optional[float] = field(
- default=None,
- metadata={
- "help": "Removed: replicate with a custom reward function checking if `eos_token_id` is in "
- "`completion_ids`."
- },
- )
-
def __post_init__(self):
self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
- _DEPRECATED_PARAMS = {
- "rloo_k": "num_generations",
- "cliprange": "epsilon",
- "kl_coef": "beta",
- "exp_name": "run_name",
- "normalize_reward": "normalize_advantages",
- "num_ppo_epochs": "num_iterations",
- "num_mini_batches": "steps_per_generation",
- "total_episodes": "max_steps",
- "response_length": "max_completion_length",
- }
-
- _REMOVED_PARAMS = {
- "token_level_kl",
- "dataset_num_proc",
- "local_rollout_forward_batch_size",
- "num_sample_generations",
- "stop_token",
- "stop_token_id",
- "missing_eos_penalty",
- }
-
- # Check for deprecated parameters and issue warnings
- for old_param, new_param in _DEPRECATED_PARAMS.items():
- if getattr(self, old_param) is not None:
- old_value = getattr(self, old_param)
- if old_param == "total_episodes":
- old_value = old_value // self.gradient_accumulation_steps
- warnings.warn(
- f"Parameter '{old_param}' is deprecated and will be removed in version 0.25.0. Please use "
- f"'{new_param}' instead. We are setting {new_param}={old_value}"
- )
- # Set the new parameter with the old value
- setattr(self, new_param, old_value)
- # Clear the deprecated parameter
- setattr(self, old_param, None)
-
- for removed_param in _REMOVED_PARAMS:
- if hasattr(self, removed_param) and getattr(self, removed_param) is not None:
- warnings.warn(
- f"Parameter '{removed_param}' is deprecated and will be removed in version 0.25.0. Please refer "
- "to the migration guide: https://huggingface.co/docs/trl/en/rloo_trainer##migration-guide-from-the-old-implementation-021-and-below"
- )
-
super().__post_init__()
num_processes = self.world_size
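Since `__post_init__` no longer renames deprecated fields automatically, configs must use the new names directly. A hedged sketch of the mapping the removed shim applied (all values are placeholders):

```python
from trl import RLOOConfig

training_args = RLOOConfig(
    output_dir="rloo-model",
    num_generations=4,          # was: rloo_k
    epsilon=0.2,                # was: cliprange
    beta=0.05,                  # was: kl_coef
    run_name="rloo-run",        # was: exp_name
    normalize_advantages=True,  # was: normalize_reward
    num_iterations=1,           # was: num_ppo_epochs
    steps_per_generation=4,     # was: num_mini_batches
    max_steps=1000,             # was: total_episodes // gradient_accumulation_steps
    max_completion_length=256,  # was: response_length
)
```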
diff --git a/trl/trainer/rloo_trainer.py b/trl/trainer/rloo_trainer.py
index 59fc9c285e0..a6ff57aa8f2 100644
--- a/trl/trainer/rloo_trainer.py
+++ b/trl/trainer/rloo_trainer.py
@@ -192,47 +192,6 @@ def reward_func(completions, **kwargs):
model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
peft_config ([`~peft.PeftConfig`], *optional*):
PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
-
-    config:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `args` instead.
-
-    reward_model:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `reward_funcs` instead.
-
-    policy:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `model` instead.
-
-    ref_policy:
-        This parameter is deprecated and will be removed in version 0.25.0. To use the initial model as the
-        reference model, simply omit this parameter. The parameter is ignored.
-
-    data_collator:
-        This parameter is deprecated and will be removed in version 0.25.0. The RLOOTrainer does not use a data
-        collator, so this parameter is ignored.
-
"""
_tag_names = ["trl", "rloo"]
@@ -255,9 +214,8 @@ def reward_func(completions, **kwargs):
def __init__(
self,
- # Note for dev: we can remove the default None when we remove the deprecated model parameter in version 0.25.0
- model: Union[str, PreTrainedModel] = None,
- reward_funcs: Union[RewardFunc, list[RewardFunc]] = None,
+ model: Union[str, PreTrainedModel],
+ reward_funcs: Union[RewardFunc, list[RewardFunc]],
args: Optional[RLOOConfig] = None,
train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None,
@@ -266,12 +224,6 @@ def __init__(
callbacks: Optional[list[TrainerCallback]] = None,
optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None),
peft_config: Optional["PeftConfig"] = None,
- # Deprecated parameters
- config=None,
- reward_model=None,
- policy=None,
- ref_policy=None,
- data_collator=None,
):
if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
warnings.warn(
@@ -280,70 +232,6 @@ def __init__(
"https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
"TRL_EXPERIMENTAL_SILENCE=1."
)
- # Handle deprecated parameters
- if config is not None:
- warnings.warn(
- "Parameter 'config' is deprecated and will be removed in version 0.25.0. Please use 'args' instead. "
- "We are setting args=config"
- )
- if args is None:
- args = config
- else:
- raise ValueError("Cannot specify both 'config' (deprecated) and 'args'. Please use 'args' only.")
-
- if reward_model is not None:
- warnings.warn(
- "Parameter 'reward_model' is deprecated and will be removed in version 0.25.0. Please use "
- "'reward_funcs' instead. We are setting reward_funcs=reward_model"
- )
- if reward_funcs is None:
- reward_funcs = reward_model
- else:
- raise ValueError(
- "Cannot specify both 'reward_model' (deprecated) and 'reward_funcs'. Please use 'reward_funcs' "
- "only."
- )
- if policy is not None:
- warnings.warn(
- "Parameter 'policy' is deprecated and will be removed in version 0.25.0. Please use 'model' instead. "
- "We are setting model=policy"
- )
- if model is None:
- model = policy
- else:
- raise ValueError("Cannot specify both 'policy' (deprecated) and 'model'. Please use 'model' only.")
- if ref_policy is not None:
- warnings.warn(
- "Parameter 'ref_policy' is deprecated and will be removed in version 0.25.0. To use the initial model "
- "as the reference model, simply omit this parameter. The parameter is ignored."
- )
- if data_collator is not None:
- warnings.warn(
- "Parameter 'data_collator' is deprecated and will be removed in version 0.25.0. The RLOOTrainer does "
- "not use a data collator, so this parameter is ignored."
- )
- if "input_ids" in train_dataset.column_names:
- warnings.warn(
- "The training dataset contains a column named 'input_ids', indicating that it is pre-tokenized. "
- "Support for pre-tokenized datasets is deprecated and will be removed in version 0.25. Please provide "
- "the raw dataset (conversational or standard) with a 'prompt' column instead."
- )
-
- def decode(example, tokenizer):
- return {"prompt": tokenizer.decode(example["input_ids"])}
-
- train_dataset = train_dataset.map(decode, fn_kwargs={"tokenizer": processing_class})
- if eval_dataset is not None and "input_ids" in eval_dataset.column_names:
- warnings.warn(
- "The evaluation dataset contains a column named 'input_ids', indicating that it is pre-tokenized. "
- "Support for pre-tokenized datasets is deprecated and will be removed in version 0.25. Please provide "
- "the raw dataset (conversational or standard) with a 'prompt' column instead."
- )
-
- def decode(example, tokenizer):
- return {"prompt": tokenizer.decode(example["input_ids"])}
-
- eval_dataset = eval_dataset.map(decode, fn_kwargs={"tokenizer": processing_class})
# Args
if args is None:
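A hedged sketch of what callers of `RLOOTrainer` now do themselves: pass the new argument names and, for pre-tokenized datasets, decode `input_ids` back into a `prompt` column (mirroring the helper the removed shim used). All variable names are placeholders.

```python
from trl import RLOOTrainer

def decode(example, tokenizer):
    # Rebuild the raw prompt from a pre-tokenized example, as the removed shim used to do.
    return {"prompt": tokenizer.decode(example["input_ids"])}

if "input_ids" in train_dataset.column_names:
    train_dataset = train_dataset.map(decode, fn_kwargs={"tokenizer": tokenizer})

trainer = RLOOTrainer(
    model=model,                # was: policy=...
    reward_funcs=reward_funcs,  # was: reward_model=...
    args=training_args,         # was: config=...
    train_dataset=train_dataset,
    processing_class=tokenizer,
)
```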
diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py
index 7c547687dcc..f335166c527 100644
--- a/trl/trainer/utils.py
+++ b/trl/trainer/utils.py
@@ -1251,34 +1251,6 @@ def empty_cache() -> None:
torch.cuda.empty_cache()
-def decode_and_strip_padding(inputs: torch.Tensor, tokenizer: PreTrainedTokenizerBase) -> list[str]:
- # docstyle-ignore
- """
- Decodes the input tensor and strips the padding tokens.
-
- > [!WARNING]
- > This function is deprecated and will be removed in a version 0.25.0. If you want to keep using it, please copy
- > the code into your codebase and use it from there.
-
- Args:
- inputs (`torch.Tensor`):
- The input tensor to be decoded.
- tokenizer ([`~transformers.PreTrainedTokenizerBase`]):
- The tokenizer used to decode the input tensor.
-
- Returns:
- `list[str]`:
- The list of decoded strings with padding tokens stripped.
- """
- warnings.warn(
- "The function `decode_and_strip_padding` is deprecated and will be removed in a version 0.25.0. If you want "
- "to keep using it, please copy the code into your codebase and use it from there.",
- FutureWarning,
- )
- decoded = tokenizer.batch_decode(inputs, skip_special_tokens=False)
- return [d.replace(tokenizer.pad_token, "") for d in decoded]
-
-
def generate_model_card(
base_model: Optional[str],
model_name: str,
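The removed helper's deprecation notice pointed users to copy the code into their own codebase; a self-contained copy reproducing the deleted logic would look like this:

```python
import torch
from transformers import PreTrainedTokenizerBase


def decode_and_strip_padding(inputs: torch.Tensor, tokenizer: PreTrainedTokenizerBase) -> list[str]:
    """Decode the input tensor and strip the tokenizer's padding token from each string."""
    decoded = tokenizer.batch_decode(inputs, skip_special_tokens=False)
    return [d.replace(tokenizer.pad_token, "") for d in decoded]
```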
diff --git a/trl/trainer/xpo_trainer.py b/trl/trainer/xpo_trainer.py
index e289bce5bb9..0dfdddad550 100644
--- a/trl/trainer/xpo_trainer.py
+++ b/trl/trainer/xpo_trainer.py
@@ -93,14 +93,6 @@ class XPOTrainer(OnlineDPOTrainer):
The optimizer and scheduler to use for training.
preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
The function to use to preprocess the logits before computing the metrics.
-
-    reward_model:
-        This parameter is deprecated and will be removed in version 0.25.0. Use `reward_funcs` instead.
-
"""
_tag_names = ["trl", "xpo"]
@@ -137,15 +129,12 @@ def __init__(
callbacks: Optional[list[TrainerCallback]] = None,
optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
- # Deprecated parameters
- reward_model: Optional[Union[PreTrainedModel, nn.Module]] = None,
) -> None:
super().__init__(
model=model,
ref_model=ref_model,
judge=judge,
reward_funcs=reward_funcs,
- reward_model=reward_model,
args=args,
data_collator=data_collator,
train_dataset=train_dataset,