18 changes: 18 additions & 0 deletions tests/test_dpo_trainer.py
@@ -1513,5 +1513,23 @@ def test_vdpo_trainer(self, model_id):
self.assertFalse(torch.allclose(param, new_param, rtol=1e-12, atol=1e-12), f"Param {n} is not updated")


class TestDPOConfig(TrlTestCase):
@parameterized.expand([(f_div_type, as_str) for f_div_type in list(FDivergenceType) for as_str in [False, True]])
def test_f_divergence_type(self, f_divergence_type, as_string: bool):
training_args = DPOConfig(
output_dir=self.tmp_dir,
report_to="none",
f_divergence_type=f_divergence_type.value if as_string else f_divergence_type,
)

# Internal normalization: keep Enum member
assert isinstance(training_args.f_divergence_type, FDivergenceType)
assert training_args.f_divergence_type == f_divergence_type

# Serialization: TrainingArguments.to_dict should yield the enum's string value
config_dict = training_args.to_dict()
assert config_dict["f_divergence_type"] == f_divergence_type.value


if __name__ == "__main__":
unittest.main()
3 changes: 2 additions & 1 deletion tests/test_trainers_args.py
@@ -24,6 +24,7 @@
CPOTrainer,
DPOConfig,
DPOTrainer,
FDivergenceType,
KTOConfig,
KTOTrainer,
NashMDConfig,
@@ -192,7 +193,7 @@ def test_dpo(self):
self.assertEqual(trainer.args.ref_adapter_name, "dummy_adapter")
self.assertEqual(trainer.args.reference_free, True)
self.assertEqual(trainer.args.force_use_ref_model, True)
self.assertEqual(trainer.args.f_divergence_type, "js_divergence")
self.assertEqual(trainer.args.f_divergence_type, FDivergenceType.JS_DIVERGENCE)
self.assertEqual(trainer.args.f_alpha_divergence_coef, 0.5)
# self.assertEqual(trainer.args.sync_ref_model, True)
self.assertEqual(trainer.args.ref_model_mixup_alpha, 0.5)
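Note on the assertion change above: assuming `FDivergenceType` is a plain `Enum` (not a `str` mixin), which the changed expectation suggests, the field no longer compares equal to the raw string `"js_divergence"` once `DPOConfig` normalizes it to an enum member, so the test must expect the member itself. A minimal sketch of the distinction, assuming `trl` is installed and exports `FDivergenceType` as the added import indicates:

```python
from trl import FDivergenceType

# Looking the member up by its value string yields the canonical member:
assert FDivergenceType("js_divergence") is FDivergenceType.JS_DIVERGENCE
# A plain Enum member, however, is not equal to the bare string
# "js_divergence", which is why the assertion above now expects the member.
```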
5 changes: 3 additions & 2 deletions trl/trainer/dpo_config.py
@@ -165,7 +165,7 @@ class DPOConfig(TrainingArguments):
Parameter controlling the deviation from the reference model. Higher β means less deviation from the
reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in
the [paper](https://huggingface.co/papers/2310.12036).
f_divergence_type ([`FDivergenceType`], *optional*, defaults to `FDivergenceType.REVERSE_KL`):
f_divergence_type ([`FDivergenceType`] or `str`, *optional*, defaults to `FDivergenceType.REVERSE_KL`):
Type of f-divergence regularization function to compute divergence between policy and reference model.
f_alpha_divergence_coef (`float`, *optional*, defaults to `1.0`):
α coefficient in the α-divergence u^-α regularization function for DPO loss.
@@ -396,7 +396,7 @@ class DPOConfig(TrainingArguments):
"Higher β means less deviation from the reference model."
},
)
f_divergence_type: FDivergenceType = field(
f_divergence_type: Union[FDivergenceType, str] = field(
default=FDivergenceType.REVERSE_KL,
metadata={
"help": "Type of f-divergence regularization function to compute divergence between policy and reference "
@@ -496,6 +496,7 @@ class DPOConfig(TrainingArguments):

def __post_init__(self):
self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
self.f_divergence_type = FDivergenceType(self.f_divergence_type)

# Normalize loss_type to string format for internal use
if hasattr(self.loss_type, "__len__") and len(self.loss_type) == 1:
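The one-liner added to `__post_init__` is what makes both spellings equivalent: calling the enum class with a value string looks up the member, and calling it with a member that already belongs to the enum returns that member unchanged. A rough sketch of the resulting behaviour, mirroring the new `TestDPOConfig` test (assuming `trl` is installed; `output_dir="out"` is just a placeholder):

```python
from trl import DPOConfig, FDivergenceType

# A string or an enum member both end up as the same FDivergenceType member.
args_from_str = DPOConfig(output_dir="out", report_to="none", f_divergence_type="js_divergence")
args_from_enum = DPOConfig(output_dir="out", report_to="none", f_divergence_type=FDivergenceType.JS_DIVERGENCE)
assert args_from_str.f_divergence_type is FDivergenceType.JS_DIVERGENCE
assert args_from_enum.f_divergence_type is FDivergenceType.JS_DIVERGENCE

# TrainingArguments.to_dict() converts Enum fields back to their string
# values, which is what the serialization check in the new test verifies.
assert args_from_str.to_dict()["f_divergence_type"] == "js_divergence"
```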
4 changes: 2 additions & 2 deletions trl/trainer/dpo_trainer.py
@@ -1065,7 +1065,7 @@ def dpo_loss(
chosen_logratios = chosen_logps.to(device) - (not self.reference_free) * ref_chosen_logps.to(device)
rejected_logratios = rejected_logps.to(device) - (not self.reference_free) * ref_rejected_logps.to(device)

if self.f_divergence_type == FDivergenceType.ALPHA_DIVERGENCE.value:
if self.f_divergence_type == FDivergenceType.ALPHA_DIVERGENCE:
# The alpha-divergence formula: (1 - u^-alpha) / alpha
# The divergence difference between the chosen and rejected sample is:
# (1 - u[w]^-alpha) / alpha - (1 - u[l]^-alpha) / alpha
@@ -1087,7 +1087,7 @@
ref_logratios = ref_logratios.to(self.accelerator.device)
logits = logratios - ref_logratios

if self.f_divergence_type == FDivergenceType.JS_DIVERGENCE.value:
if self.f_divergence_type == FDivergenceType.JS_DIVERGENCE:
# The js-divergence formula: log(2 * u / (1 + u))
# The divergence difference between the chosen and rejected sample is:
# log(2 * u[w] / (1 + u[w])) - log(2 * u[l] / (1 + u[l]))
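Because the config now always carries a `FDivergenceType` member, `dpo_loss` can compare enum members directly instead of `.value` strings. A condensed sketch of that dispatch (not the trainer's actual code; the alpha/JS formulas are the ones quoted in the comments above):

```python
from trl import FDivergenceType


def f_divergence_formula(f_divergence_type: FDivergenceType) -> str:
    # Branch on enum members, mirroring the comparisons in dpo_loss.
    if f_divergence_type == FDivergenceType.ALPHA_DIVERGENCE:
        return "(1 - u^-alpha) / alpha"  # alpha-divergence
    if f_divergence_type == FDivergenceType.JS_DIVERGENCE:
        return "log(2 * u / (1 + u))"  # js-divergence
    return "reverse KL (default)"


print(f_divergence_formula(FDivergenceType.JS_DIVERGENCE))
```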