
Commit 64d089e

lewtun and qgallouedec authored
Reasoning reward (#4563)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
1 parent 3b7d0e4 commit 64d089e

File tree: 8 files changed, +195 −130 lines


README.md

Lines changed: 3 additions & 0 deletions
````diff
@@ -104,6 +104,9 @@ trainer = GRPOTrainer(
 trainer.train()
 ```
 
+> [!NOTE]
+> For reasoning models, use the `reasoning_accuracy_reward()` function for better results.
+
 ### `DPOTrainer`
 
 [`DPOTrainer`](https://huggingface.co/docs/trl/dpo_trainer) implements the popular [Direct Preference Optimization (DPO) algorithm](https://huggingface.co/papers/2305.18290) that was used to post-train [Llama 3](https://huggingface.co/papers/2407.21783) and many other models. Here is a basic example of how to use the `DPOTrainer`:
````
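For a sense of what the new note recommends, here is a minimal sketch of calling the reward directly. The call shapes and expected values are taken from the tests added in this commit; having the math-verify dependency installed is assumed.

```python
# Illustrative sketch, not part of the commit: calling the new reward directly.
from trl.rewards import reasoning_accuracy_reward

# Each completion is a list of messages; only the message content is scored.
completions = [[{"content": r"<think> 6 * 7 = 42 </think> The answer is \boxed{42}."}]]
solutions = [r"\boxed{42}"]

# Expected: [1.0] for a correct answer placed after the closing </think> tag.
print(reasoning_accuracy_reward(completions, solutions))
```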

docs/source/lora_without_regret.md

Lines changed: 2 additions & 94 deletions
````diff
@@ -135,94 +135,6 @@ The blog post performs GRPO on a range of models and datasets from the Hub, and

 For reinforcement learning, the blog uses a math reasoning task that we can reproduce as a Python function.

-<details>
-<summary>Reward function</summary>
-
-```python
-def strip_reasoning_accuracy_reward(
-    completions: list[list[dict[str, str]]], solution: list[str], **kwargs
-) -> list[float | None]:
-    """Reward function that strips reasoning tags and checks mathematical accuracy.
-
-    This function:
-    1. Extracts the content from completions
-    2. Removes <think></think> tags (for reasoning that shouldn't be evaluated)
-    3. Parses both the gold solution and the predicted answer
-    4. Uses math_verify to check if they are mathematically equivalent
-
-    Args:
-        completions: List of model completions, each containing a list of messages
-        solution: List of ground truth solutions
-        **kwargs: Additional arguments (ignored but required for trainer compatibility)
-
-    Returns:
-        List of rewards where:
-        - 1.0 if the answer is correct
-        - 0.0 if the answer is incorrect
-        - None if the solution is not parseable (skips this example)
-    """
-    contents = [completion[0]["content"] for completion in completions]
-    rewards = []
-
-    for content, sol in zip(contents, solution):
-        # Strip reasoning tags from completion
-        while "<think>" in content and "</think>" in content:
-            start = content.find("<think>")
-            end = content.find("</think>", start)
-            if start != -1 and end != -1:
-                content = content[:start] + content[end + len("</think>") :]
-            else:
-                break
-
-        # Parse gold solution
-        gold_parsed = parse(
-            f"${sol}$",
-            extraction_config=[
-                LatexExtractionConfig(
-                    boxed_match_priority=0, try_extract_without_anchor=True
-                )
-            ],
-        )
-
-        if len(gold_parsed) != 0:
-            # We require the answer to be provided in correct latex (no malformed operators)
-            answer_parsed = parse(
-                content,
-                extraction_config=[
-                    LatexExtractionConfig(
-                        boxed_match_priority=0,
-                        normalization_config=NormalizationConfig(
-                            basic_latex=True,
-                            units=True,
-                            malformed_operators=False,
-                            nits=False,
-                            boxed=True,
-                        ),
-                        try_extract_without_anchor=False,
-                    )
-                ],
-                extraction_mode="first_match",
-            )
-
-            # Compute binary rewards if verifiable, `None` otherwise to skip this example
-            try:
-                reward = float(verify(gold_parsed, answer_parsed))
-            except Exception as e:
-                print(
-                    f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}"
-                )
-                reward = None
-        else:
-            # If the gold solution is not parseable, we assign `None` to skip this example
-            reward = None
-
-        rewards.append(reward)
-
-    return rewards
-```
-
-</details>
-
 <hfoptions id="grpo">
 <hfoption id="python">

@@ -233,14 +145,10 @@ We can implement these recommendations with the TRL Python API like so:
 from datasets import load_dataset
 from peft import LoraConfig
 from trl import GRPOConfig, GRPOTrainer
+from trl.rewards import reasoning_accuracy_reward

 dataset = load_dataset("HuggingFaceH4/OpenR1-Math-220k-default-verified", split="train")

-def strip_reasoning_accuracy_reward(completions, **kwargs):
-    """Reward function that strips reasoning and accuracy scores from the model outputs."""
-
-    ...
-
 peft_config = LoraConfig(
     r=1,
     lora_alpha=32,
@@ -259,7 +167,7 @@ training_args = GRPOConfig(

 trainer = GRPOTrainer(
     model="Qwen/Qwen3-0.6B",
-    reward_funcs=strip_reasoning_accuracy_reward,
+    reward_funcs=reasoning_accuracy_reward,
     args=training_args,
     train_dataset=dataset,
     peft_config=peft_config,
````
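The deleted helper stripped `<think>…</think>` spans before verifying. Judging by the new tests, the built-in replacement is stricter: the answer must appear after the closing reasoning delimiter, and a completion that never closes its reasoning scores 0.0. A hypothetical sketch of that splitting step (an illustration inferred from the tests, not TRL's actual source):

```python
# Hypothetical sketch of the delimiter handling implied by the new tests;
# not the real implementation of `reasoning_accuracy_reward`.
def split_after_reasoning(content: str, delimiters: tuple[str, ...] = ("</think>",)) -> str | None:
    """Return only the text after the last closing reasoning delimiter, or None if none closes."""
    for delim in delimiters:
        if delim in content:
            return content.rsplit(delim, 1)[1]
    return None  # incomplete reasoning -> the reward function scores 0.0


assert split_after_reasoning(r"<think> work </think> \boxed{42}") == r" \boxed{42}"
assert split_after_reasoning("<think> no closing tag") is None
```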

docs/source/rewards.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ This module contains some useful reward functions, primarily intended for use wi
66

77
[[autodoc]] rewards.accuracy_reward
88

9+
## reasoning_accuracy_reward
10+
11+
[[autodoc]] rewards.reasoning_accuracy_reward
12+
913
## think_format_reward
1014

1115
[[autodoc]] rewards.think_format_reward

tests/test_rewards.py

Lines changed: 67 additions & 18 deletions
```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from trl.rewards import accuracy_reward, get_soft_overlong_punishment, think_format_reward
+from trl.rewards import accuracy_reward, get_soft_overlong_punishment, reasoning_accuracy_reward, think_format_reward
 
 from .testing_utils import TrlTestCase, require_math_latex
 
@@ -117,27 +117,76 @@ def test_accuracy_reward_unparseable_gold(self):
         """Test accuracy_reward with an unparseable gold solution."""
         completion = [
             [{"content": "Answer is forty two."}],
-            [{"content": "Some other content."}],
-            [{"content": r"Answer is \boxed{42}."}],
-            [{"content": r"Answer is \boxed{\mathbf{42}}."}],  # Make response bold
-            [{"content": r"Answer is \boxed{\textbf{42}}."}],  # Different latex command for bold
-            [{"content": r"Answer is \boxed{42}."}],
-            [{"content": r"Answer is \boxed{42.3456}."}],
+            [{"content": r"Some other content. \boxed{43}."}],
         ]
         solution = [
             "Answer is forty two.",
             "Answer is forty three.",
-            "Answer is 42.0",  # Decimal point
-            "Answer is 42 43 okay?",  # Extra space
-            "Answer is 42",
-            r"Answer is \n\boxed{42}",  # Newline in gold solution
-            "Answer is 42.34560",  # Extra trailing zero
         ]
         rewards = accuracy_reward(completion, solution)
-        assert rewards[0] == 1.0  # Should revert to exact text match
+        assert rewards[0] is None
+        assert rewards[1] is None
+
+
+class TestReasoningAccuracyReward:
+    @require_math_latex
+    def test_correct_answer_yields_unit_reward(self):
+        completions = [
+            [{"content": r"<think> Reasoning content </think> \boxed{\frac{63}{400}}"}],
+            [{"content": r"Reasoning content </think> \boxed{\frac{63}{400}}"}],
+        ]
+        solutions = [r"\frac{63}{400}", r"\frac{63}{400}"]
+        rewards = reasoning_accuracy_reward(completions, solutions)
+        assert rewards[0] == 1.0
+        assert rewards[1] == 1.0
+
+    @require_math_latex
+    def test_correct_answer_with_custom_tags_yields_unit_reward(self):
+        completions = [
+            [{"content": r"<REASONING_START> Reasoning content </REASONING_END> \boxed{\frac{63}{400}}"}],
+        ]
+        solutions = [
+            r"\frac{63}{400}",
+        ]
+        rewards = reasoning_accuracy_reward(completions, solutions, reasoning_delimiters=["</REASONING_END>"])
+        assert rewards[0] == 1.0
+
+    @require_math_latex
+    def test_incorrect_answer_yields_zero_reward(self):
+        completion = [[{"content": r"<think> Reasoning content </think> \boxed{\frac{64}{400}}"}]]
+        solution = [r"\frac{63}{400}"]
+        rewards = reasoning_accuracy_reward(completion, solution)
+        assert rewards[0] == 0.0
+
+    @require_math_latex
+    def test_correct_answer_in_reasoning_yields_zero_reward(self):
+        completions = [
+            [{"content": r"<think> My answer is \boxed{42} </think> Some other text."}],
+            [{"content": r"<think> The answer is \boxed{42} </think> Here's a wrong answer: \boxed{43}."}],
+        ]
+        solutions = [r"\boxed{42}", r"\boxed{42}"]
+        rewards = reasoning_accuracy_reward(completions, solutions)
+        assert rewards[0] == 0.0
+        assert rewards[1] == 0.0
+
+    @require_math_latex
+    def test_incomplete_reasoning_yields_zero_reward(self):
+        completions = [
+            [{"content": r"<think> Incomplete reasoning without closing tag"}],
+            [{"content": r"Correct answer \frac{63}{400} but completely missing reasoning content"}],
+        ]
+        solutions = [r"\frac{63}{400}", r"\frac{63}{400}"]
+        rewards = reasoning_accuracy_reward(completions, solutions)
+        assert rewards[0] == 0.0
         assert rewards[1] == 0.0
-        assert rewards[2] == 1.0
-        assert rewards[3] == 1.0
-        assert rewards[4] == 1.0
-        assert rewards[5] == 1.0
-        assert rewards[6] == 1.0  # Should ignore trailing zeros
+
+    @require_math_latex
+    def test_unparseable_gold_solution_yields_none_reward(self):
+        completions = [
+            [{"content": r"<think> Reasoning content </think> \boxed{42}"}],
+        ]
+        solutions = [
+            "forty two",
+        ]
+        rewards = reasoning_accuracy_reward(completions, solutions)
+        assert rewards[0] is None
```
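The custom-tag test above also documents the `reasoning_delimiters` keyword; a usage sketch derived directly from that test (the tag names come from the test, not a recommendation):

```python
# Usage sketch derived from test_correct_answer_with_custom_tags_yields_unit_reward.
from trl.rewards import reasoning_accuracy_reward

completions = [[{"content": r"<REASONING_START> Reasoning content </REASONING_END> \boxed{\frac{63}{400}}"}]]
solutions = [r"\frac{63}{400}"]

# Expected: [1.0]; the answer is only scored after the custom closing delimiter.
print(reasoning_accuracy_reward(completions, solutions, reasoning_delimiters=["</REASONING_END>"]))
```

The new suite can be run with `pytest tests/test_rewards.py -k TestReasoningAccuracyReward`; the `require_math_latex` marker skips these tests when the math-verify dependency is not installed.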

trl/rewards/__init__.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -19,14 +19,14 @@
 
 
 _import_structure = {
-    "accuracy_rewards": ["accuracy_reward"],
+    "accuracy_rewards": ["accuracy_reward", "reasoning_accuracy_reward"],
     "format_rewards": ["think_format_reward"],
     "other_rewards": ["get_soft_overlong_punishment"],
 }
 
 
 if TYPE_CHECKING:
-    from .accuracy_rewards import accuracy_reward
+    from .accuracy_rewards import accuracy_reward, reasoning_accuracy_reward
     from .format_rewards import think_format_reward
     from .other_rewards import get_soft_overlong_punishment
 
```
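Both edits are needed because `trl.rewards` resolves its attributes lazily at runtime from `_import_structure`, while the `TYPE_CHECKING` imports exist only for static analyzers. A simplified sketch of the general pattern (PEP 562 module `__getattr__`; TRL's actual mechanism may differ):

```python
# Simplified illustration of a lazy-import pattern; not TRL's actual code.
import importlib

_import_structure = {
    "accuracy_rewards": ["accuracy_reward", "reasoning_accuracy_reward"],
}


def __getattr__(name: str):
    # Import the owning submodule only on first attribute access.
    for submodule, exported in _import_structure.items():
        if name in exported:
            module = importlib.import_module(f".{submodule}", __name__)
            return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```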
