
Commit 6d6a603

Merge branch 'main' into Issues_#4403
2 parents 9e3f098 + 43253b2 commit 6d6a603

13 files changed (+44, −39 lines)


docs/source/paper_index.md

Lines changed: 44 additions & 0 deletions
@@ -605,3 +605,47 @@ def add_margin(example):
## Distillation

Papers relating to training a student model with the help of a teacher model.

### On-Policy Distillation

**📰 Blog**: https://thinkingmachines.ai/blog/on-policy-distillation/

On-Policy Distillation involves the student model generating rollouts for each batch of training data. For every token of those rollouts, we obtain the probability distributions from both the student and the teacher model, and the student is then optimized to minimize the reverse Kullback-Leibler (KL) divergence between its own token distributions and those of the teacher.

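To make the objective concrete, the following is a minimal sketch of the per-token reverse KL term described above. It is illustrative only: the function name and tensor shapes are assumptions, not TRL's internal implementation.

```python
import torch
import torch.nn.functional as F


def per_token_reverse_kl(student_logits: torch.Tensor, teacher_logits: torch.Tensor) -> torch.Tensor:
    """Reverse KL, i.e. KL(student || teacher), computed for every rollout token.

    Both inputs are assumed to have shape (batch, seq_len, vocab_size).
    """
    student_logp = F.log_softmax(student_logits, dim=-1)
    teacher_logp = F.log_softmax(teacher_logits, dim=-1)
    # Sum over the vocabulary of p_student * (log p_student - log p_teacher).
    return (student_logp.exp() * (student_logp - teacher_logp)).sum(dim=-1)


# Toy example: random logits stand in for student/teacher outputs on a rollout.
student_logits = torch.randn(2, 5, 32)
teacher_logits = torch.randn(2, 5, 32)
loss = per_token_reverse_kl(student_logits, teacher_logits).mean()
```

Because this divergence is available for every generated token, the training signal is dense, in contrast to the sequence-level rewards typical of reinforcement learning (see the comparison below).
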
| Method                 | Sampling   | Reward signal |
|------------------------|------------|---------------|
| Supervised finetuning  | off-policy | dense         |
| Reinforcement learning | on-policy  | sparse        |
| On-policy distillation | on-policy  | dense         |

On-policy distillation has been shown to outperform both SFT and GRPO, and it can be used to restore generalization capabilities lost during SFT.

Additionally, on-policy distillation is more compute-efficient and less prone to overfitting when trained on limited data.

To train a model with on-policy distillation in TRL, you can use the [`GKDTrainer`] together with the following [`GKDConfig`]:

```python
from trl import GKDConfig

config = GKDConfig(
    lmbda=1.0,  # the student produces rollouts for all batches
    beta=1.0,  # use reverse KL as the loss function
    teacher_model_name_or_path="teacher-model",  # specify the teacher model
)
```

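For orientation, here is a hedged end-to-end sketch of handing such a config to the [`GKDTrainer`]. The student identifier and dataset name are placeholders, and the constructor arguments (`teacher_model`, `processing_class`, `train_dataset`) assume a recent TRL release:

```python
from datasets import load_dataset
from transformers import AutoTokenizer
from trl import GKDConfig, GKDTrainer

# Placeholder names -- substitute your own student model, teacher model, and dataset.
student_id = "student-model"
tokenizer = AutoTokenizer.from_pretrained(student_id)
train_dataset = load_dataset("your-org/your-chat-dataset", split="train")  # conversational "messages" format

training_args = GKDConfig(
    lmbda=1.0,
    beta=1.0,
    teacher_model_name_or_path="teacher-model",
    output_dir="on-policy-distillation",
)

trainer = GKDTrainer(
    model=student_id,  # the student; a model instance also works
    teacher_model="teacher-model",  # passed explicitly; some versions also read it from the config
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
)
trainer.train()
```
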
Alternatively, you can use the [`GOLDTrainer`] and [`GOLDConfig`] to perform on-policy distillation with a similar configuration:

```python
from trl.experimental import GOLDConfig

config = GOLDConfig(
    lmbda=1.0,  # the student produces rollouts for all batches
    beta=1.0,  # use reverse KL as the loss function
    teacher_model_name_or_path="teacher-model",  # specify the teacher model
)
```

examples/scripts/cpo.py

Lines changed: 0 additions & 3 deletions
@@ -64,7 +64,6 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser

from trl import CPOConfig, CPOTrainer, ModelConfig, ScriptArguments, get_peft_config
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE


# Enable logging in a Hugging Face Space
@@ -90,8 +89,6 @@
# Dataset
################
dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
-if tokenizer.chat_template is None:
-    tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE

################
# Training

examples/scripts/nash_md.py

Lines changed: 0 additions & 3 deletions
@@ -73,7 +73,6 @@
    get_kbit_device_map,
    get_quantization_config,
)
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE


# Enable logging in a Hugging Face Space
@@ -128,8 +127,6 @@
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
-if tokenizer.chat_template is None:
-    tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE

dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)

examples/scripts/online_dpo.py

Lines changed: 0 additions & 3 deletions
@@ -69,7 +69,6 @@
    get_peft_config,
    get_quantization_config,
)
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE


# Enable logging in a Hugging Face Space
@@ -131,8 +130,6 @@
    trust_remote_code=model_args.trust_remote_code,
    **model_kwargs,
)
-if tokenizer.chat_template is None:
-    tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

examples/scripts/orpo.py

Lines changed: 0 additions & 3 deletions
@@ -64,7 +64,6 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser

from trl import ModelConfig, ORPOConfig, ORPOTrainer, ScriptArguments, get_peft_config
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE


# Enable logging in a Hugging Face Space
@@ -91,8 +90,6 @@
# Dataset
################
dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
-if tokenizer.chat_template is None:
-    tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE

################
# Training

examples/scripts/ppo/ppo.py

Lines changed: 0 additions & 3 deletions
@@ -43,7 +43,6 @@
    get_peft_config,
    get_quantization_config,
)
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE


# Enable logging in a Hugging Face Space
@@ -106,8 +105,6 @@
    model_args.model_name_or_path, padding_side="left", trust_remote_code=model_args.trust_remote_code
)
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-if tokenizer.chat_template is None:
-    tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
value_model = AutoModelForSequenceClassification.from_pretrained(
    training_args.reward_model_path, trust_remote_code=model_args.trust_remote_code, num_labels=1
)

examples/scripts/ppo/ppo_tldr.py

Lines changed: 0 additions & 3 deletions
@@ -43,7 +43,6 @@
    get_peft_config,
    get_quantization_config,
)
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE


# Enable logging in a Hugging Face Space
@@ -113,8 +112,6 @@
    model_args.model_name_or_path, padding_side="left", trust_remote_code=model_args.trust_remote_code
)
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-if tokenizer.chat_template is None:
-    tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
value_model = AutoModelForSequenceClassification.from_pretrained(
    training_args.reward_model_path, trust_remote_code=model_args.trust_remote_code, num_labels=1
)

examples/scripts/xpo.py

Lines changed: 0 additions & 3 deletions
@@ -57,7 +57,6 @@
    get_kbit_device_map,
    get_quantization_config,
)
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE


# Enable logging in a Hugging Face Space
@@ -113,8 +112,6 @@
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
-if tokenizer.chat_template is None:
-    tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE

dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)

tests/test_cpo_trainer.py

Lines changed: 0 additions & 3 deletions
@@ -17,7 +17,6 @@
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer

from trl import CPOConfig, CPOTrainer
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE

from .testing_utils import TrlTestCase, require_peft

@@ -33,15 +32,13 @@ def setup_method(self):
        model_id = "trl-internal-testing/tiny-T5ForConditionalGeneration"
        self.t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
        self.t5_tokenizer = AutoTokenizer.from_pretrained(model_id)
-        self.t5_tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE

    @pytest.mark.parametrize(
        "name, loss_type, config_name",
        [
            ("qwen", "sigmoid", "standard_preference"),
            ("t5", "hinge", "standard_implicit_prompt_preference"),
            ("qwen", "ipo", "conversational_preference"),
-            ("t5", "ipo", "conversational_implicit_prompt_preference"),
            ("qwen", "simpo", "standard_preference"),
            ("t5", "simpo", "standard_implicit_prompt_preference"),
            ("qwen", "hinge", "conversational_preference"),

tests/test_gkd_trainer.py

Lines changed: 0 additions & 5 deletions
@@ -21,7 +21,6 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

from trl import GKDConfig, GKDTrainer
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE

from .testing_utils import TrlTestCase, require_liger_kernel

@@ -206,10 +205,6 @@ def setup_method(self):
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.tokenizer.pad_token = self.tokenizer.eos_token

-        # Ensure the tokenizer has a chat template
-        if not hasattr(self.tokenizer, "chat_template") or self.tokenizer.chat_template is None:
-            self.tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
-
    def test_gkd_trainer(self):
        training_args = GKDConfig(
            output_dir=self.tmp_dir,
