From e0decfde3b37fd48b2175648b97754750a48cd38 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 10:53:28 +0000 Subject: [PATCH 01/33] Add stuff --- .gitignore | 4 +- .../grpo/config_demo.yaml | 50 ++++++++++++++ src/open_r1/grpo.py | 2 + src/open_r1/rewards.py | 66 +++++++++++++++++++ 4 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml diff --git a/.gitignore b/.gitignore index d44c47f6..f4db7819 100644 --- a/.gitignore +++ b/.gitignore @@ -175,4 +175,6 @@ data/ wandb/ logs/ eval_results/ -results/ \ No newline at end of file +results/ + +.vscode/ \ No newline at end of file diff --git a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml new file mode 100644 index 00000000..19f4d6f5 --- /dev/null +++ b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml @@ -0,0 +1,50 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-1.5B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: AI-MO/NuminaMath-TIR +dataset_configs: +- all +# Num processes is less by 1 as vLLM is using 1 GPU +num_processes: 7 + +# GRPO trainer config +bf16: true +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: true +eval_strategy: steps +eval_steps: 100 +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-Coder-1.5B-Open-R1-GRPO +hub_strategy: every_save +learning_rate: 2.0e-05 +log_completions: true +log_level: info +logging_steps: 5 +logging_strategy: steps +lr_scheduler_type: cosine +max_prompt_length: 512 +max_completion_length: 1024 +max_steps: -1 +num_generations: 7 +num_train_epochs: 1 +output_dir: data/Qwen2.5-Coder-1.5B-Open-R1-GRPO +overwrite_output_dir: true +per_device_eval_batch_size: 32 +per_device_train_batch_size: 16 +push_to_hub: false #true +report_to: +- wandb +reward_funcs: +- code_reward +save_strategy: "no" +seed: 42 +warmup_ratio: 0.1 diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 128375db..f7bd036e 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -31,6 +31,7 @@ get_cosine_scaled_reward, get_repetition_penalty_reward, reasoning_steps_reward, + code_reward ) from open_r1.utils.callbacks import get_callbacks from open_r1.utils.logging import init_wandb_training @@ -162,6 +163,7 @@ def main(script_args, training_args, model_args): ngram_size=script_args.repetition_n_grams, max_penalty=script_args.repetition_max_penalty, ), + "code_reward": code_reward, } reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index bec3d11c..19d18442 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -5,6 +5,7 @@ from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify +from e2b_code_interpreter import Sandbox def accuracy_reward(completions, solution, **kwargs): @@ -197,3 +198,68 @@ def repetition_penalty_reward(completions, **kwargs) -> float: return rewards return repetition_penalty_reward + + +def extract_code(completion : str) -> str: + pattern = re.compile(r"```python\n(.*?)```", re.DOTALL) + matches = pattern.findall(completion) + extracted_answer = matches[-1] if len(matches) >= 1 else "" + return extracted_answer + +import json + +def 
code_reward(completions, **kwargs): + from e2b_code_interpreter import Sandbox + + sbx = Sandbox() + """Returns a reward function that evaluates code snippets in a sandbox.""" + evaluation_script_template = """import subprocess + import json + + def evaluate_code(code, test_cases): + passed = 0 + total = len(test_cases) + + for case in test_cases: + process = subprocess.run( + ["python3", "-c", code], + input=case["input"], + text=True, + capture_output=True + ) + + if process.returncode != 0: # Error in execution + continue + + output = process.stdout.strip() + if output == case["output"]: + passed += 1 + + success_rate = (passed / total) + + code_snippet = {code} + test_cases = json.loads({test_cases}) + + evaluate_code(code_snippet, test_cases) + """ + code_snippets = [extract_code(completion) for completion in completions] + test_cases = kwargs["verification_info"]["test_cases"] + scripts = [evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(test_cases))) for code in code_snippets] + rewards = [] + for script in scripts: + execution = sbx.run_code(script, on_stdout=lambda data: print('stdout:', data)) # Execute Python inside the sandbox + + output = "" + if len(execution.logs.stdout) > 0: + output += "\n".join(execution.logs.stdout) + if len(execution.logs.stderr) > 0: + output += "\n".join(execution.logs.stderr) + if execution.error is not None: + output += execution.error.traceback + + # convert output to float + output = float(output) + rewards.append(output) + return rewards + + From da197834db0b1502df4baa5e5952e4a3e7daae36 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 12:14:41 +0000 Subject: [PATCH 02/33] Make it kind of work --- .../grpo/config_demo.yaml | 20 +++++------ src/open_r1/grpo.py | 2 +- src/open_r1/rewards.py | 35 ++++++++++++------- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml index 19f4d6f5..c8433c1a 100644 --- a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml @@ -5,7 +5,7 @@ torch_dtype: bfloat16 attn_implementation: flash_attention_2 # Data training arguments -dataset_name: AI-MO/NuminaMath-TIR +dataset_name: open-r1/verifiable-coding-problems-python-1k dataset_configs: - all # Num processes is less by 1 as vLLM is using 1 GPU @@ -17,34 +17,34 @@ use_vllm: true vllm_device: auto vllm_gpu_memory_utilization: 0.7 do_eval: true -eval_strategy: steps +eval_strategy: 'no' #steps eval_steps: 100 -gradient_accumulation_steps: 16 +gradient_accumulation_steps: 1 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false hub_model_id: Qwen2.5-Coder-1.5B-Open-R1-GRPO hub_strategy: every_save -learning_rate: 2.0e-05 +learning_rate: 1.0e-06 log_completions: true log_level: info -logging_steps: 5 +logging_steps: 1 logging_strategy: steps lr_scheduler_type: cosine -max_prompt_length: 512 -max_completion_length: 1024 +max_prompt_length: 1024 +max_completion_length: 2048 max_steps: -1 num_generations: 7 num_train_epochs: 1 output_dir: data/Qwen2.5-Coder-1.5B-Open-R1-GRPO overwrite_output_dir: true -per_device_eval_batch_size: 32 -per_device_train_batch_size: 16 +per_device_train_batch_size: 1 push_to_hub: false #true report_to: - wandb reward_funcs: -- code_reward +- code +- format save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 
30e60832..3d8de0dc 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -165,7 +165,7 @@ def main(script_args, training_args, model_args): max_penalty=script_args.repetition_max_penalty, ), "length": len_reward, - "code_reward": code_reward, + "code": code_reward, } reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 639cde5b..edb00ad5 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -283,11 +283,14 @@ def extract_code(completion : str) -> str: import json def code_reward(completions, **kwargs): + from dotenv import load_dotenv + load_dotenv() from e2b_code_interpreter import Sandbox sbx = Sandbox() """Returns a reward function that evaluates code snippets in a sandbox.""" - evaluation_script_template = """import subprocess + evaluation_script_template = """ + import subprocess import json def evaluate_code(code, test_cases): @@ -310,30 +313,38 @@ def evaluate_code(code, test_cases): passed += 1 success_rate = (passed / total) + return success_rate code_snippet = {code} test_cases = json.loads({test_cases}) evaluate_code(code_snippet, test_cases) """ - code_snippets = [extract_code(completion) for completion in completions] - test_cases = kwargs["verification_info"]["test_cases"] - scripts = [evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(test_cases))) for code in code_snippets] + code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] + verification_info = kwargs["verification_info"] + scripts = [evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"]))) for code, info in zip(code_snippets, verification_info)] rewards = [] for script in scripts: - execution = sbx.run_code(script, on_stdout=lambda data: print('stdout:', data)) # Execute Python inside the sandbox + print(f"=== Script ===\n{script}\n=== End of Script ===") - output = "" - if len(execution.logs.stdout) > 0: - output += "\n".join(execution.logs.stdout) - if len(execution.logs.stderr) > 0: - output += "\n".join(execution.logs.stderr) - if execution.error is not None: - output += execution.error.traceback + execution = sbx.run_code(script, on_stdout=lambda data: print('stdout:', data), request_timeout=3) # Execute Python inside the sandbox + + print(f"=== Execution ===\n{execution}\n=== End of Execution ===") + + output = execution.text + + # if len(execution.logs.stdout) > 0: + # output += "\n".join(execution.logs.stdout) + # if len(execution.logs.stderr) > 0: + # output += "\n".join(execution.logs.stderr) + # if execution.error is not None: + # output += execution.error.traceback # convert output to float output = float(output) rewards.append(output) + + print(f"Rewards: {rewards}") return rewards From 6ba5302e42814b6aed22a89096adcf0d67819a51 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 14:15:17 +0000 Subject: [PATCH 03/33] Add more stuff --- .../grpo/config_demo.yaml | 2 + .../grpo/config_demo.yaml | 6 +-- src/open_r1/grpo.py | 2 +- src/open_r1/rewards.py | 38 +++++++++---------- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml index 81793939..595b09b5 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml @@ -42,6 +42,8 @@ per_device_train_batch_size: 16 push_to_hub: 
true report_to: - wandb +reward_funcs: +- format save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml index c8433c1a..43252e4a 100644 --- a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml @@ -8,8 +8,6 @@ attn_implementation: flash_attention_2 dataset_name: open-r1/verifiable-coding-problems-python-1k dataset_configs: - all -# Num processes is less by 1 as vLLM is using 1 GPU -num_processes: 7 # GRPO trainer config bf16: true @@ -38,13 +36,13 @@ num_generations: 7 num_train_epochs: 1 output_dir: data/Qwen2.5-Coder-1.5B-Open-R1-GRPO overwrite_output_dir: true -per_device_train_batch_size: 1 +per_device_train_batch_size: 2 push_to_hub: false #true report_to: - wandb reward_funcs: +# - format - code -- format save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 3d8de0dc..592434ae 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -27,12 +27,12 @@ from open_r1.configs import GRPOConfig from open_r1.rewards import ( accuracy_reward, + code_reward, format_reward, get_cosine_scaled_reward, get_repetition_penalty_reward, len_reward, reasoning_steps_reward, - code_reward ) from open_r1.utils.callbacks import get_callbacks from open_r1.utils.wandb_logging import init_wandb_training diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index edb00ad5..00ef3841 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -4,9 +4,9 @@ import re from typing import Dict +from e2b_code_interpreter import Sandbox from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify -from e2b_code_interpreter import Sandbox def accuracy_reward(completions, solution, **kwargs): @@ -274,16 +274,19 @@ def repetition_penalty_reward(completions, **kwargs) -> float: return repetition_penalty_reward -def extract_code(completion : str) -> str: +def extract_code(completion: str) -> str: pattern = re.compile(r"```python\n(.*?)```", re.DOTALL) matches = pattern.findall(completion) extracted_answer = matches[-1] if len(matches) >= 1 else "" return extracted_answer + import json + def code_reward(completions, **kwargs): from dotenv import load_dotenv + load_dotenv() from e2b_code_interpreter import Sandbox @@ -322,29 +325,22 @@ def evaluate_code(code, test_cases): """ code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] verification_info = kwargs["verification_info"] - scripts = [evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"]))) for code, info in zip(code_snippets, verification_info)] + scripts = [ + evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"]))) + for code, info in zip(code_snippets, verification_info) + ] rewards = [] for script in scripts: - print(f"=== Script ===\n{script}\n=== End of Script ===") + # print(f"=== Script ===\n{script}\n=== End of Script ===") - execution = sbx.run_code(script, on_stdout=lambda data: print('stdout:', data), request_timeout=3) # Execute Python inside the sandbox + execution = sbx.run_code( + script, on_stdout=lambda data: print("stdout:", data), request_timeout=3 + ) # Execute Python inside the sandbox - print(f"=== Execution ===\n{execution}\n=== End of Execution ===") + # print(f"=== Execution 
===\n{execution}\n=== End of Execution ===") - output = execution.text - - # if len(execution.logs.stdout) > 0: - # output += "\n".join(execution.logs.stdout) - # if len(execution.logs.stderr) > 0: - # output += "\n".join(execution.logs.stderr) - # if execution.error is not None: - # output += execution.error.traceback - - # convert output to float - output = float(output) + output = float(execution.text) rewards.append(output) - - print(f"Rewards: {rewards}") - return rewards - + # print(f"Rewards: {rewards}") + return rewards From 78cf722790eea27443d95238289224a44b0b8311 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 16:17:04 +0000 Subject: [PATCH 04/33] Add fix for parse --- .../grpo/config_demo.yaml | 6 +- src/open_r1/rewards.py | 118 ++++++++++-------- 2 files changed, 70 insertions(+), 54 deletions(-) diff --git a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml index 43252e4a..d1fe4db1 100644 --- a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml @@ -36,13 +36,15 @@ num_generations: 7 num_train_epochs: 1 output_dir: data/Qwen2.5-Coder-1.5B-Open-R1-GRPO overwrite_output_dir: true -per_device_train_batch_size: 2 +per_device_train_batch_size: 1 push_to_hub: false #true report_to: - wandb reward_funcs: -# - format +- format - code save_strategy: "no" seed: 42 warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 00ef3841..9d9ac1df 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -1,14 +1,19 @@ """Reward functions for GRPO training.""" +import json import math import re from typing import Dict +from dotenv import load_dotenv from e2b_code_interpreter import Sandbox from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify +load_dotenv() + + def accuracy_reward(completions, solution, **kwargs): """Reward function that checks if the completion is the same as the ground truth.""" contents = [completion[0]["content"] for completion in completions] @@ -281,66 +286,75 @@ def extract_code(completion: str) -> str: return extracted_answer -import json - - def code_reward(completions, **kwargs): - from dotenv import load_dotenv - - load_dotenv() - from e2b_code_interpreter import Sandbox - - sbx = Sandbox() - """Returns a reward function that evaluates code snippets in a sandbox.""" - evaluation_script_template = """ - import subprocess - import json - - def evaluate_code(code, test_cases): - passed = 0 - total = len(test_cases) - - for case in test_cases: - process = subprocess.run( - ["python3", "-c", code], - input=case["input"], - text=True, - capture_output=True - ) + rewards = [] + try: + sbx = Sandbox(timeout=30, request_timeout=3) + """Returns a reward function that evaluates code snippets in a sandbox.""" + evaluation_script_template = """ + import subprocess + import json + + def evaluate_code(code, test_cases): + passed = 0 + total = len(test_cases) + + for case in test_cases: + process = subprocess.run( + ["python3", "-c", code], + input=case["input"], + text=True, + capture_output=True + ) - if process.returncode != 0: # Error in execution - continue + if process.returncode != 0: # Error in execution + continue - output = process.stdout.strip() - if output == case["output"]: - passed += 1 + output = process.stdout.strip() + print("output") + print(output) + print() + 
print("case") + print(case["output"]) + print() + print(output.strip() == case["output"].strip()) + if output.strip() == case["output"].strip(): + passed += 1 - success_rate = (passed / total) - return success_rate + success_rate = (passed / total) + return success_rate - code_snippet = {code} - test_cases = json.loads({test_cases}) + code_snippet = {code} + test_cases = json.loads({test_cases}) - evaluate_code(code_snippet, test_cases) - """ - code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] - verification_info = kwargs["verification_info"] - scripts = [ - evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"]))) - for code, info in zip(code_snippets, verification_info) - ] - rewards = [] - for script in scripts: - # print(f"=== Script ===\n{script}\n=== End of Script ===") + evaluate_code(code_snippet, test_cases) + """ + code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] + # gold_code_snippets = [extract_code(sol) for sol in kwargs["gold_standard_solution"]] + verification_info = kwargs["verification_info"] + scripts = [ + evaluation_script_template.format( + code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"])) + ) + for code, info in zip(code_snippets, verification_info) + ] + for script in scripts: + # print(f"=== Script ===\n{script}\n=== End of Script ===") + + execution = sbx.run_code( + script, on_stdout=lambda data: print("stdout:", data) + ) # Execute Python inside the sandbox - execution = sbx.run_code( - script, on_stdout=lambda data: print("stdout:", data), request_timeout=3 - ) # Execute Python inside the sandbox + # print(f"=== Execution ===\n{execution}\n=== End of Execution ===") - # print(f"=== Execution ===\n{execution}\n=== End of Execution ===") + output = float(execution.text) + rewards.append(output) - output = float(execution.text) - rewards.append(output) + # print(f"Rewards: {rewards}") - # print(f"Rewards: {rewards}") + # Shutdown to stay in limits + sbx.kill() + except Exception as e: + print(f"Error: {e}") + rewards = [0.0] * len(completions) return rewards From 24dc34f9c8a60cb7bd4f4a9d3d8adc281085c29b Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 16:47:01 +0000 Subject: [PATCH 05/33] Fix --- .../grpo/config_demo.yaml | 4 +--- src/open_r1/grpo.py | 2 +- src/open_r1/rewards.py | 23 +++++++------------ 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml index d1fe4db1..1440a5b1 100644 --- a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml @@ -1,5 +1,5 @@ # Model arguments -model_name_or_path: Qwen/Qwen2.5-Coder-1.5B-Instruct +model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct model_revision: main torch_dtype: bfloat16 attn_implementation: flash_attention_2 @@ -46,5 +46,3 @@ reward_funcs: save_strategy: "no" seed: 42 warmup_ratio: 0.1 -wandb_entity: huggingface -wandb_project: open-r1 diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 592434ae..9ef7f1cc 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -103,7 +103,7 @@ class GRPOScriptArguments(ScriptArguments): "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. 
The assistant " "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning " "process and answer are enclosed within and tags, respectively, i.e., " - " reasoning process here answer here " + " reasoning process here answer here . Make sure the Python code in the answer is enclosed with backticks." ) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 9d9ac1df..3b4c25f8 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -289,7 +289,6 @@ def extract_code(completion: str) -> str: def code_reward(completions, **kwargs): rewards = [] try: - sbx = Sandbox(timeout=30, request_timeout=3) """Returns a reward function that evaluates code snippets in a sandbox.""" evaluation_script_template = """ import subprocess @@ -338,22 +337,16 @@ def evaluate_code(code, test_cases): ) for code, info in zip(code_snippets, verification_info) ] - for script in scripts: - # print(f"=== Script ===\n{script}\n=== End of Script ===") + with Sandbox(timeout=30, request_timeout=3) as sbx: + for script in scripts: + print("Running code in sandbox") + execution = sbx.run_code(script) + print("Execution completed") - execution = sbx.run_code( - script, on_stdout=lambda data: print("stdout:", data) - ) # Execute Python inside the sandbox + output = float(execution.text) + rewards.append(output) - # print(f"=== Execution ===\n{execution}\n=== End of Execution ===") - - output = float(execution.text) - rewards.append(output) - - # print(f"Rewards: {rewards}") - - # Shutdown to stay in limits - sbx.kill() + # print(f"Rewards: {rewards}") except Exception as e: print(f"Error: {e}") rewards = [0.0] * len(completions) From 22244fe73d6a18d090377d2482d2728a6c3f3f76 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 17:55:37 +0000 Subject: [PATCH 06/33] Refactor --- .../grpo/config_demo.yaml | 8 ++++---- src/open_r1/grpo.py | 2 +- src/open_r1/rewards.py | 13 +++++++++---- 3 files changed, 14 insertions(+), 9 deletions(-) rename recipes/{Qwen2.5-Coder-1.5B-Instruct => Qwen2.5-Coder-3B-Instruct}/grpo/config_demo.yaml (85%) diff --git a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml similarity index 85% rename from recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml rename to recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml index 1440a5b1..6962f3c3 100644 --- a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml @@ -21,9 +21,9 @@ gradient_accumulation_steps: 1 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -hub_model_id: Qwen2.5-Coder-1.5B-Open-R1-GRPO +hub_model_id: Qwen2.5-Coder-3B-Open-R1-GRPO hub_strategy: every_save -learning_rate: 1.0e-06 +learning_rate: 1.0e-05 log_completions: true log_level: info logging_steps: 1 @@ -34,9 +34,9 @@ max_completion_length: 2048 max_steps: -1 num_generations: 7 num_train_epochs: 1 -output_dir: data/Qwen2.5-Coder-1.5B-Open-R1-GRPO +output_dir: data/Qwen2.5-Coder-3B-Open-R1-GRPO overwrite_output_dir: true -per_device_train_batch_size: 1 +per_device_train_batch_size: 2 push_to_hub: false #true report_to: - wandb diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 9ef7f1cc..abe6d947 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -103,7 +103,7 @@ class GRPOScriptArguments(ScriptArguments): "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. 
The assistant " "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning " "process and answer are enclosed within and tags, respectively, i.e., " - " reasoning process here answer here . Make sure the Python code in the answer is enclosed with backticks." + " reasoning process here ```python\n# ANSWER HERE\n``` ." ) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 3b4c25f8..d82876bf 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -297,13 +297,15 @@ def code_reward(completions, **kwargs): def evaluate_code(code, test_cases): passed = 0 total = len(test_cases) + exec_timeout = 5 for case in test_cases: process = subprocess.run( ["python3", "-c", code], input=case["input"], text=True, - capture_output=True + capture_output=True, + timeout=exec_timeout ) if process.returncode != 0: # Error in execution @@ -338,12 +340,15 @@ def evaluate_code(code, test_cases): for code, info in zip(code_snippets, verification_info) ] with Sandbox(timeout=30, request_timeout=3) as sbx: - for script in scripts: + for code, script in zip(code_snippets, scripts): print("Running code in sandbox") - execution = sbx.run_code(script) + execution = sbx.run_code(script, request_timeout=3) print("Execution completed") - output = float(execution.text) + try: + output = float(execution.text) + except (TypeError, ValueError): + output = 0.0 rewards.append(output) # print(f"Rewards: {rewards}") From c32d1377bf476d5423ff283ef114473fa88a09e0 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 21:15:36 +0000 Subject: [PATCH 07/33] Clean up --- .../grpo/config_demo.yaml | 9 ++++++--- src/open_r1/rewards.py | 15 +-------------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml index 6962f3c3..11d40fbd 100644 --- a/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml @@ -23,7 +23,7 @@ gradient_checkpointing_kwargs: use_reentrant: false hub_model_id: Qwen2.5-Coder-3B-Open-R1-GRPO hub_strategy: every_save -learning_rate: 1.0e-05 +learning_rate: 1.0e-06 log_completions: true log_level: info logging_steps: 1 @@ -32,17 +32,20 @@ lr_scheduler_type: cosine max_prompt_length: 1024 max_completion_length: 2048 max_steps: -1 -num_generations: 7 +num_generations: 14 num_train_epochs: 1 output_dir: data/Qwen2.5-Coder-3B-Open-R1-GRPO overwrite_output_dir: true -per_device_train_batch_size: 2 +per_device_train_batch_size: 8 push_to_hub: false #true report_to: - wandb reward_funcs: - format - code +reward_weights: +- 0.1 +- 1.0 save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index d82876bf..149e162b 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -312,13 +312,6 @@ def evaluate_code(code, test_cases): continue output = process.stdout.strip() - print("output") - print(output) - print() - print("case") - print(case["output"]) - print() - print(output.strip() == case["output"].strip()) if output.strip() == case["output"].strip(): passed += 1 @@ -331,7 +324,6 @@ def evaluate_code(code, test_cases): evaluate_code(code_snippet, test_cases) """ code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] - # gold_code_snippets = [extract_code(sol) for sol in kwargs["gold_standard_solution"]] verification_info = kwargs["verification_info"] scripts = [ 
evaluation_script_template.format( @@ -340,18 +332,13 @@ def evaluate_code(code, test_cases): for code, info in zip(code_snippets, verification_info) ] with Sandbox(timeout=30, request_timeout=3) as sbx: - for code, script in zip(code_snippets, scripts): - print("Running code in sandbox") + for script in scripts: execution = sbx.run_code(script, request_timeout=3) - print("Execution completed") - try: output = float(execution.text) except (TypeError, ValueError): output = 0.0 rewards.append(output) - - # print(f"Rewards: {rewards}") except Exception as e: print(f"Error: {e}") rewards = [0.0] * len(completions) From dab15e0567db2fc552a62f2b486c13e6d0e0b625 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 21:41:19 +0000 Subject: [PATCH 08/33] Fix config --- ...ig_demo.yaml => config_codeforces_1k.yaml} | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) rename recipes/Qwen2.5-Coder-3B-Instruct/grpo/{config_demo.yaml => config_codeforces_1k.yaml} (71%) diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml similarity index 71% rename from recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml rename to recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml index 11d40fbd..1e33be08 100644 --- a/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml @@ -10,42 +10,46 @@ dataset_configs: - all # GRPO trainer config +benchmarks: +- gpqa bf16: true -use_vllm: true -vllm_device: auto -vllm_gpu_memory_utilization: 0.7 -do_eval: true -eval_strategy: 'no' #steps -eval_steps: 100 +callbacks: +- push_to_hub_revision +eval_strategy: 'no' gradient_accumulation_steps: 1 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -hub_model_id: Qwen2.5-Coder-3B-Open-R1-GRPO +hub_model_id: open-r1/Qwen2.5-Coder-3B-GRPO hub_strategy: every_save learning_rate: 1.0e-06 log_completions: true log_level: info logging_steps: 1 logging_strategy: steps -lr_scheduler_type: cosine +lr_scheduler_type: constant max_prompt_length: 1024 max_completion_length: 2048 max_steps: -1 num_generations: 14 num_train_epochs: 1 -output_dir: data/Qwen2.5-Coder-3B-Open-R1-GRPO +output_dir: data/Qwen2.5-Coder-3B-GRPO overwrite_output_dir: true -per_device_train_batch_size: 8 -push_to_hub: false #true +per_device_train_batch_size: 16 +push_to_hub: true report_to: - wandb reward_funcs: - format - code reward_weights: -- 0.1 +- 0.25 - 1.0 -save_strategy: "no" +save_strategy: "steps" +save_steps: 10 +save_total_limit: 1 seed: 42 -warmup_ratio: 0.1 +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +warmup_ratio: 0.0 From edc502d00e4210a728458a2fab15244f80a58b6b Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Fri, 14 Feb 2025 09:12:22 +0000 Subject: [PATCH 09/33] Fix sys --- src/open_r1/grpo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index abe6d947..592434ae 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -103,7 +103,7 @@ class GRPOScriptArguments(ScriptArguments): "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant " "first thinks about the reasoning process in the mind and then provides the user with the answer. 
The reasoning " "process and answer are enclosed within and tags, respectively, i.e., " - " reasoning process here ```python\n# ANSWER HERE\n``` ." + " reasoning process here answer here " ) From 27af68edce3b36fcd722a22dc7968240784a0a05 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sat, 15 Feb 2025 21:37:02 +0000 Subject: [PATCH 10/33] Add SFT config --- .../sft/config_demo.yaml | 2 +- .../sft/config_openthoughts_code.yaml | 49 +++++++++++++++++++ slurm/train.slurm | 8 +-- src/open_r1/sft.py | 2 + 4 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml diff --git a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml index c7dd25bb..fb5830b9 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml @@ -8,7 +8,7 @@ attn_implementation: flash_attention_2 dataset_name: HuggingFaceH4/Bespoke-Stratos-17k dataset_configs: - all -preprocessing_num_workers: 8 +dataset_num_proc: 8 # SFT trainer config bf16: true diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml new file mode 100644 index 00000000..ec5e342b --- /dev/null +++ b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml @@ -0,0 +1,49 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenThoughts-114k-code +dataset_configs: +- all +dataset_num_proc: 48 + +# SFT trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- gpqa +bf16: true +do_eval: true +eval_strategy: epoch +gradient_accumulation_steps: 4 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-3B-Instruct-SFT-openthoughts-code-v00.00 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_level: info +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +packing: true +max_seq_length: 32768 +max_steps: -1 +num_train_epochs: 3 +output_dir: data/Qwen2.5-Coder-3B-Instruct-SFT-openthoughts-code-v00.00 +overwrite_output_dir: true +per_device_eval_batch_size: 4 +per_device_train_batch_size: 4 +push_to_hub: true +report_to: +- wandb +save_strategy: "epoch" +save_total_limit: 1 +seed: 42 +use_liger: true +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.03 \ No newline at end of file diff --git a/slurm/train.slurm b/slurm/train.slurm index c10a2a23..c212afed 100644 --- a/slurm/train.slurm +++ b/slurm/train.slurm @@ -32,11 +32,11 @@ WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) # Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') -USE_VLLM=$(grep 'use_vllm:\s*true' $CONFIG_FILE) # Match "use_vllm: true" (with optional whitespace) +# USE_VLLM=$(grep 'use_vllm:\s*true' $CONFIG_FILE) # Match "use_vllm: true" (with optional whitespace) -if [ -n "$USE_VLLM" ]; then # Check if USE_VLLM is *not* empty (found) - WORLD_SIZE=$(($WORLD_SIZE-1)) -fi +# if [ -n "$USE_VLLM" ]; then # Check if USE_VLLM is *not* empty (found) +# 
WORLD_SIZE=$(($WORLD_SIZE-1)) +# fi # Split the string into individual arguments IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS" diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index b6031d81..91bf74d3 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -67,6 +67,8 @@ def main(script_args, training_args, model_args): # Set seed for reproducibility set_seed(training_args.seed) + training_args.lr_scheduler_kwargs = {"min_lr": training_args.learning_rate * 0.1} + ############### # Setup logging ############### From 53eaddb639c3dc7d08e3cf9f45a18bec63427eba Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sat, 15 Feb 2025 21:52:50 +0000 Subject: [PATCH 11/33] Use min rate --- .../Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml | 2 ++ src/open_r1/sft.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml index ec5e342b..3a6b7583 100644 --- a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml +++ b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml @@ -29,6 +29,8 @@ log_level: info logging_steps: 1 logging_strategy: steps lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 packing: true max_seq_length: 32768 max_steps: -1 diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index 91bf74d3..b6031d81 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -67,8 +67,6 @@ def main(script_args, training_args, model_args): # Set seed for reproducibility set_seed(training_args.seed) - training_args.lr_scheduler_kwargs = {"min_lr": training_args.learning_rate * 0.1} - ############### # Setup logging ############### From 385d79988018a072b265cd53693610f97e1d277c Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 16 Feb 2025 07:30:51 +0000 Subject: [PATCH 12/33] Fix eval --- .../Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml index 3a6b7583..47d23bd4 100644 --- a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml +++ b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml @@ -37,7 +37,7 @@ max_steps: -1 num_train_epochs: 3 output_dir: data/Qwen2.5-Coder-3B-Instruct-SFT-openthoughts-code-v00.00 overwrite_output_dir: true -per_device_eval_batch_size: 4 +per_device_eval_batch_size: 1 per_device_train_batch_size: 4 push_to_hub: true report_to: From 52fc68198579748b63b16f1ce8733b271f1d2069 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 16 Feb 2025 21:25:50 +0000 Subject: [PATCH 13/33] Add base model --- .../grpo/config_codeforces_1k.yaml | 55 +++++++++++++++++++ .../sft/config_openthoughts_code.yaml | 51 +++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml create mode 100644 recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml diff --git a/recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml b/recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml new file mode 100644 index 00000000..1e33be08 --- /dev/null +++ b/recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml @@ -0,0 +1,55 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: 
flash_attention_2 + +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python-1k +dataset_configs: +- all + +# GRPO trainer config +benchmarks: +- gpqa +bf16: true +callbacks: +- push_to_hub_revision +eval_strategy: 'no' +gradient_accumulation_steps: 1 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-3B-GRPO +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant +max_prompt_length: 1024 +max_completion_length: 2048 +max_steps: -1 +num_generations: 14 +num_train_epochs: 1 +output_dir: data/Qwen2.5-Coder-3B-GRPO +overwrite_output_dir: true +per_device_train_batch_size: 16 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- format +- code +reward_weights: +- 0.25 +- 1.0 +save_strategy: "steps" +save_steps: 10 +save_total_limit: 1 +seed: 42 +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +warmup_ratio: 0.0 diff --git a/recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml b/recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml new file mode 100644 index 00000000..33e37136 --- /dev/null +++ b/recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml @@ -0,0 +1,51 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-3B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenThoughts-114k-code +dataset_configs: +- all +dataset_num_proc: 48 + +# SFT trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- gpqa +bf16: true +do_eval: true +eval_strategy: epoch +gradient_accumulation_steps: 4 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-3B-SFT-openthoughts-code-v00.00 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_level: info +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +packing: true +max_seq_length: 32768 +max_steps: -1 +num_train_epochs: 3 +output_dir: data/Qwen2.5-Coder-3B-SFT-openthoughts-code-v00.00 +overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 4 +push_to_hub: true +report_to: +- wandb +save_strategy: "epoch" +save_total_limit: 1 +seed: 42 +use_liger: true +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.03 \ No newline at end of file From 884387f1a04efe7ad228c3190a341f5a5f5b05a3 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 16 Feb 2025 22:13:54 +0000 Subject: [PATCH 14/33] Add s1k --- .../sft/config_s1k.yaml | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml new file mode 100644 index 00000000..516a0c59 --- /dev/null +++ b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml @@ -0,0 +1,52 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/s1K-1.1 +dataset_configs: +- all +dataset_num_proc: 48 + +# SFT trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- gpqa +- math_500 +bf16: true +do_eval: false +eval_strategy: epoch +gradient_accumulation_steps: 1 
+gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-3B-Instruct-SFT-s1k-v00.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_level: info +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +packing: true +max_seq_length: 32768 +max_steps: -1 +num_train_epochs: 15 +output_dir: data/Qwen2.5-Coder-3B-Instruct-SFT-s1k-v00.00 +overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 2 +push_to_hub: true +report_to: +- wandb +save_strategy: "epoch" +save_total_limit: 1 +seed: 42 +use_liger: true +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.05 \ No newline at end of file From 2d3c79794fad8716a252a5b8f16a9a3d53603f98 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Mon, 17 Feb 2025 11:07:57 +0000 Subject: [PATCH 15/33] Disable eval --- recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml index 516a0c59..ccfb2139 100644 --- a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml +++ b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml @@ -18,7 +18,7 @@ benchmarks: - math_500 bf16: true do_eval: false -eval_strategy: epoch +eval_strategy: 'no' gradient_accumulation_steps: 1 gradient_checkpointing: true gradient_checkpointing_kwargs: From aaa8f6f7c994b484f1a4b90ebc8be21f3cbc562e Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 14:57:06 +0000 Subject: [PATCH 16/33] Fix --- .../grpo/config_code_demo.yaml | 60 +++++++++++++++++++ src/open_r1/rewards.py | 6 +- 2 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml new file mode 100644 index 00000000..9b26bb2a --- /dev/null +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml @@ -0,0 +1,60 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python-10k +dataset_configs: +- default +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.001 +bf16: true +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.9 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO +hub_strategy: every_save +learning_rate: 5.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_prompt_length: 1024 +max_completion_length: 7168 +max_steps: 1000 +num_generations: 14 +num_train_epochs: 1 +output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO +overwrite_output_dir: true +per_device_eval_batch_size: 4 +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- code +- format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 10 +save_total_limit: 1 +seed: 42 +temperature: 1.0 +warmup_ratio: 0.03 +wandb_entity: huggingface +wandb_project: open-r1 \ No newline at end of file diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 0e39d9d2..a0eb2b56 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -287,6 +287,10 @@ def extract_code(completion: str) -> str: def code_reward(completions, **kwargs): + """Reward function that evaluates code snippets using the E2B code interpreter. + + Assumes the dataset contains a `verification_info` column with test cases. + """ rewards = [] try: """Returns a reward function that evaluates code snippets in a sandbox.""" @@ -340,6 +344,6 @@ def evaluate_code(code, test_cases): output = 0.0 rewards.append(output) except Exception as e: - print(f"Error: {e}") + print(f"Error from E2B executor: {e}") rewards = [0.0] * len(completions) return rewards From 20a1ea0690b4d83b4b7468bcc63827455e9b8c4a Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 15:13:45 +0000 Subject: [PATCH 17/33] Add import checker --- ...g_code_demo.yaml => config_demo_code.yaml} | 6 +++-- src/open_r1/rewards.py | 17 ++++++++++---- src/open_r1/utils/__init__.py | 3 ++- src/open_r1/utils/import_utils.py | 23 +++++++++++++++++++ 4 files changed, 41 insertions(+), 8 deletions(-) rename recipes/Qwen2.5-1.5B-Instruct/grpo/{config_code_demo.yaml => config_demo_code.yaml} (93%) create mode 100644 src/open_r1/utils/import_utils.py diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml similarity index 93% rename from recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml rename to recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 9b26bb2a..3f85c66a 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -11,13 +11,15 @@ dataset_configs: system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" # GRPO trainer config +callbacks: +- push_to_hub_revision beta: 0.001 bf16: true use_vllm: true vllm_device: auto vllm_gpu_memory_utilization: 0.9 do_eval: false -gradient_accumulation_steps: 16 +gradient_accumulation_steps: 1 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false @@ -40,7 +42,7 @@ num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO overwrite_output_dir: true per_device_eval_batch_size: 4 -per_device_train_batch_size: 8 +per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index a0eb2b56..49de06e2 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -5,13 +5,14 @@ import re from typing import Dict -from dotenv import load_dotenv -from e2b_code_interpreter import Sandbox from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify +from .utils import is_e2b_available - -load_dotenv() +if is_e2b_available(): + from dotenv import load_dotenv + from e2b_code_interpreter import Sandbox + load_dotenv() def accuracy_reward(completions, solution, **kwargs): @@ -337,13 +338,19 @@ def evaluate_code(code, test_cases): ] with Sandbox(timeout=30, request_timeout=3) as sbx: for script in scripts: - execution = sbx.run_code(script, request_timeout=3) + print("Running script") + execution = sbx.run_code(script) + print("Script run") try: output = float(execution.text) except (TypeError, ValueError): output = 0.0 + + print(f"Output: {output}") rewards.append(output) except Exception as e: print(f"Error from E2B executor: {e}") rewards = [0.0] * len(completions) + + print("Rewards finished!") return rewards diff --git a/src/open_r1/utils/__init__.py b/src/open_r1/utils/__init__.py index b1de213d..da3ec481 100644 --- a/src/open_r1/utils/__init__.py +++ b/src/open_r1/utils/__init__.py @@ -1,4 +1,5 @@ from .model_utils import get_tokenizer +from .import_utils import is_e2b_available -__all__ = ["get_tokenizer"] +__all__ = ["get_tokenizer", "is_e2b_available"] \ No newline at end of file diff --git a/src/open_r1/utils/import_utils.py b/src/open_r1/utils/import_utils.py new file mode 100644 index 00000000..8893264a --- /dev/null +++ b/src/open_r1/utils/import_utils.py @@ -0,0 +1,23 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from transformers.utils.import_utils import _is_package_available + + +# Use same as transformers.utils.import_utils +_e2b_available = _is_package_available("e2b") + + +def is_e2b_available() -> bool: + return _e2b_available From 58633036fa78f2669f5e6c531b0c76f07548cc26 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 15:41:49 +0000 Subject: [PATCH 18/33] Fix importer --- .../Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 7 +++---- src/open_r1/rewards.py | 11 ++++------- src/open_r1/utils/__init__.py | 4 ++-- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 3f85c66a..cb31a62f 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -19,11 +19,11 @@ use_vllm: true vllm_device: auto vllm_gpu_memory_utilization: 0.9 do_eval: false -gradient_accumulation_steps: 1 +gradient_accumulation_steps: 4 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO +hub_model_id: lewtun/Qwen2.5-1.5B-Open-R1-Code-GRPO hub_strategy: every_save learning_rate: 5.0e-06 log_completions: true @@ -35,13 +35,12 @@ lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 max_prompt_length: 1024 -max_completion_length: 7168 +max_completion_length: 2048 max_steps: 1000 num_generations: 14 num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO overwrite_output_dir: true -per_device_eval_batch_size: 4 per_device_train_batch_size: 16 push_to_hub: true report_to: diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 49de06e2..7b99037f 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -7,11 +7,14 @@ from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify + from .utils import is_e2b_available + if is_e2b_available(): from dotenv import load_dotenv from e2b_code_interpreter import Sandbox + load_dotenv() @@ -287,7 +290,7 @@ def extract_code(completion: str) -> str: return extracted_answer -def code_reward(completions, **kwargs): +def code_reward(completions, **kwargs) -> list[float]: """Reward function that evaluates code snippets using the E2B code interpreter. Assumes the dataset contains a `verification_info` column with test cases. 
@@ -338,19 +341,13 @@ def evaluate_code(code, test_cases): ] with Sandbox(timeout=30, request_timeout=3) as sbx: for script in scripts: - print("Running script") execution = sbx.run_code(script) - print("Script run") try: output = float(execution.text) except (TypeError, ValueError): output = 0.0 - - print(f"Output: {output}") rewards.append(output) except Exception as e: print(f"Error from E2B executor: {e}") rewards = [0.0] * len(completions) - - print("Rewards finished!") return rewards diff --git a/src/open_r1/utils/__init__.py b/src/open_r1/utils/__init__.py index da3ec481..5302463e 100644 --- a/src/open_r1/utils/__init__.py +++ b/src/open_r1/utils/__init__.py @@ -1,5 +1,5 @@ -from .model_utils import get_tokenizer from .import_utils import is_e2b_available +from .model_utils import get_tokenizer -__all__ = ["get_tokenizer", "is_e2b_available"] \ No newline at end of file +__all__ = ["get_tokenizer", "is_e2b_available"] From 8d78b8efdbb47ad2fd00ae38c00157b9492dc10e Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:31:10 +0000 Subject: [PATCH 19/33] Fix --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index cb31a62f..0c737f24 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -36,7 +36,7 @@ lr_scheduler_kwargs: min_lr_rate: 0.1 max_prompt_length: 1024 max_completion_length: 2048 -max_steps: 1000 +max_steps: 500 num_generations: 14 num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO From 932e69e82a5f0ce735145444d02c6d0e1142b7b6 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:31:37 +0000 Subject: [PATCH 20/33] Tune config --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 0c737f24..f379c112 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -34,7 +34,7 @@ logging_strategy: steps lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 -max_prompt_length: 1024 +max_prompt_length: 512 max_completion_length: 2048 max_steps: 500 num_generations: 14 From 258406f771d221d6ac59d0fcedb7827ed39996e2 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:31:58 +0000 Subject: [PATCH 21/33] Tune --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index f379c112..0c737f24 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -34,7 +34,7 @@ logging_strategy: steps lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 -max_prompt_length: 512 +max_prompt_length: 1024 max_completion_length: 2048 max_steps: 500 num_generations: 14 From fd9860e3d4c2a15f94dc78df076420f0c8dcf566 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:32:35 +0000 Subject: [PATCH 22/33] Fix --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 4 ++-- 1 file changed, 
2 insertions(+), 2 deletions(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 0c737f24..202ae86a 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -23,9 +23,9 @@ gradient_accumulation_steps: 4 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -hub_model_id: lewtun/Qwen2.5-1.5B-Open-R1-Code-GRPO +hub_model_id: open-r1/Qwen2.5-1.5B-Open-R1-Code-GRPO hub_strategy: every_save -learning_rate: 5.0e-06 +learning_rate: 5.0e-07 log_completions: true log_level: info logging_first_step: true From c614dbd0b5fddee55260e8c696ebfc794cc4e471 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:36:16 +0000 Subject: [PATCH 23/33] Fix save --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 202ae86a..6e3ca4ff 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -52,7 +52,7 @@ reward_weights: - 1.0 - 0.1 save_strategy: "steps" -save_steps: 10 +save_steps: 50 save_total_limit: 1 seed: 42 temperature: 1.0 From 51815b2eb6b3179b4ab61bdd1fb4ae6893a46b52 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:38:45 +0000 Subject: [PATCH 24/33] Tuen beta --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 6e3ca4ff..46ee165a 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -13,7 +13,7 @@ system_prompt: "You are a helpful AI Assistant that provides well-reasoned and d # GRPO trainer config callbacks: - push_to_hub_revision -beta: 0.001 +beta: 0.01 bf16: true use_vllm: true vllm_device: auto From da0840799bfea50c841f39f135f65b38f33bb383 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:43:20 +0000 Subject: [PATCH 25/33] Remove configs --- .../grpo/config_codeforces_1k.yaml | 55 ------------------- .../sft/config_openthoughts_code.yaml | 51 ----------------- .../sft/config_s1k.yaml | 52 ------------------ .../grpo/config_codeforces_1k.yaml | 55 ------------------- .../sft/config_openthoughts_code.yaml | 51 ----------------- 5 files changed, 264 deletions(-) delete mode 100644 recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml delete mode 100644 recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml delete mode 100644 recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml delete mode 100644 recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml delete mode 100644 recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml deleted file mode 100644 index 1e33be08..00000000 --- a/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml +++ /dev/null @@ -1,55 +0,0 @@ -# Model arguments -model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training 
arguments -dataset_name: open-r1/verifiable-coding-problems-python-1k -dataset_configs: -- all - -# GRPO trainer config -benchmarks: -- gpqa -bf16: true -callbacks: -- push_to_hub_revision -eval_strategy: 'no' -gradient_accumulation_steps: 1 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: open-r1/Qwen2.5-Coder-3B-GRPO -hub_strategy: every_save -learning_rate: 1.0e-06 -log_completions: true -log_level: info -logging_steps: 1 -logging_strategy: steps -lr_scheduler_type: constant -max_prompt_length: 1024 -max_completion_length: 2048 -max_steps: -1 -num_generations: 14 -num_train_epochs: 1 -output_dir: data/Qwen2.5-Coder-3B-GRPO -overwrite_output_dir: true -per_device_train_batch_size: 16 -push_to_hub: true -report_to: -- wandb -reward_funcs: -- format -- code -reward_weights: -- 0.25 -- 1.0 -save_strategy: "steps" -save_steps: 10 -save_total_limit: 1 -seed: 42 -use_vllm: true -vllm_device: auto -vllm_gpu_memory_utilization: 0.7 -warmup_ratio: 0.0 diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml deleted file mode 100644 index 47d23bd4..00000000 --- a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# Model arguments -model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training arguments -dataset_name: open-r1/OpenThoughts-114k-code -dataset_configs: -- all -dataset_num_proc: 48 - -# SFT trainer config -callbacks: -- push_to_hub_revision -benchmarks: -- gpqa -bf16: true -do_eval: true -eval_strategy: epoch -gradient_accumulation_steps: 4 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: open-r1/Qwen2.5-Coder-3B-Instruct-SFT-openthoughts-code-v00.00 -hub_strategy: every_save -learning_rate: 5.0e-07 -log_level: info -logging_steps: 1 -logging_strategy: steps -lr_scheduler_type: cosine_with_min_lr -lr_scheduler_kwargs: - min_lr_rate: 0.1 -packing: true -max_seq_length: 32768 -max_steps: -1 -num_train_epochs: 3 -output_dir: data/Qwen2.5-Coder-3B-Instruct-SFT-openthoughts-code-v00.00 -overwrite_output_dir: true -per_device_eval_batch_size: 1 -per_device_train_batch_size: 4 -push_to_hub: true -report_to: -- wandb -save_strategy: "epoch" -save_total_limit: 1 -seed: 42 -use_liger: true -wandb_entity: huggingface -wandb_project: open-r1 -warmup_ratio: 0.03 \ No newline at end of file diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml deleted file mode 100644 index ccfb2139..00000000 --- a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# Model arguments -model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training arguments -dataset_name: open-r1/s1K-1.1 -dataset_configs: -- all -dataset_num_proc: 48 - -# SFT trainer config -callbacks: -- push_to_hub_revision -benchmarks: -- gpqa -- math_500 -bf16: true -do_eval: false -eval_strategy: 'no' -gradient_accumulation_steps: 1 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: open-r1/Qwen2.5-Coder-3B-Instruct-SFT-s1k-v00.00 -hub_strategy: every_save -learning_rate: 1.0e-06 -log_level: info -logging_steps: 1 -logging_strategy: steps -lr_scheduler_type: 
cosine_with_min_lr -lr_scheduler_kwargs: - min_lr_rate: 0.1 -packing: true -max_seq_length: 32768 -max_steps: -1 -num_train_epochs: 15 -output_dir: data/Qwen2.5-Coder-3B-Instruct-SFT-s1k-v00.00 -overwrite_output_dir: true -per_device_eval_batch_size: 1 -per_device_train_batch_size: 2 -push_to_hub: true -report_to: -- wandb -save_strategy: "epoch" -save_total_limit: 1 -seed: 42 -use_liger: true -wandb_entity: huggingface -wandb_project: open-r1 -warmup_ratio: 0.05 \ No newline at end of file diff --git a/recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml b/recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml deleted file mode 100644 index 1e33be08..00000000 --- a/recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml +++ /dev/null @@ -1,55 +0,0 @@ -# Model arguments -model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training arguments -dataset_name: open-r1/verifiable-coding-problems-python-1k -dataset_configs: -- all - -# GRPO trainer config -benchmarks: -- gpqa -bf16: true -callbacks: -- push_to_hub_revision -eval_strategy: 'no' -gradient_accumulation_steps: 1 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: open-r1/Qwen2.5-Coder-3B-GRPO -hub_strategy: every_save -learning_rate: 1.0e-06 -log_completions: true -log_level: info -logging_steps: 1 -logging_strategy: steps -lr_scheduler_type: constant -max_prompt_length: 1024 -max_completion_length: 2048 -max_steps: -1 -num_generations: 14 -num_train_epochs: 1 -output_dir: data/Qwen2.5-Coder-3B-GRPO -overwrite_output_dir: true -per_device_train_batch_size: 16 -push_to_hub: true -report_to: -- wandb -reward_funcs: -- format -- code -reward_weights: -- 0.25 -- 1.0 -save_strategy: "steps" -save_steps: 10 -save_total_limit: 1 -seed: 42 -use_vllm: true -vllm_device: auto -vllm_gpu_memory_utilization: 0.7 -warmup_ratio: 0.0 diff --git a/recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml b/recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml deleted file mode 100644 index 33e37136..00000000 --- a/recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# Model arguments -model_name_or_path: Qwen/Qwen2.5-Coder-3B -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training arguments -dataset_name: open-r1/OpenThoughts-114k-code -dataset_configs: -- all -dataset_num_proc: 48 - -# SFT trainer config -callbacks: -- push_to_hub_revision -benchmarks: -- gpqa -bf16: true -do_eval: true -eval_strategy: epoch -gradient_accumulation_steps: 4 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: open-r1/Qwen2.5-Coder-3B-SFT-openthoughts-code-v00.00 -hub_strategy: every_save -learning_rate: 5.0e-07 -log_level: info -logging_steps: 1 -logging_strategy: steps -lr_scheduler_type: cosine_with_min_lr -lr_scheduler_kwargs: - min_lr_rate: 0.1 -packing: true -max_seq_length: 32768 -max_steps: -1 -num_train_epochs: 3 -output_dir: data/Qwen2.5-Coder-3B-SFT-openthoughts-code-v00.00 -overwrite_output_dir: true -per_device_eval_batch_size: 1 -per_device_train_batch_size: 4 -push_to_hub: true -report_to: -- wandb -save_strategy: "epoch" -save_total_limit: 1 -seed: 42 -use_liger: true -wandb_entity: huggingface -wandb_project: open-r1 -warmup_ratio: 0.03 \ No newline at end of file From 5f35a61b64f7de1ddfa5514d398aebfa548ac572 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 
2025 16:46:36 +0000 Subject: [PATCH 26/33] Fix vLLM --- slurm/train.slurm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/slurm/train.slurm b/slurm/train.slurm index c212afed..c10a2a23 100644 --- a/slurm/train.slurm +++ b/slurm/train.slurm @@ -32,11 +32,11 @@ WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) # Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') -# USE_VLLM=$(grep 'use_vllm:\s*true' $CONFIG_FILE) # Match "use_vllm: true" (with optional whitespace) +USE_VLLM=$(grep 'use_vllm:\s*true' $CONFIG_FILE) # Match "use_vllm: true" (with optional whitespace) -# if [ -n "$USE_VLLM" ]; then # Check if USE_VLLM is *not* empty (found) -# WORLD_SIZE=$(($WORLD_SIZE-1)) -# fi +if [ -n "$USE_VLLM" ]; then # Check if USE_VLLM is *not* empty (found) + WORLD_SIZE=$(($WORLD_SIZE-1)) +fi # Split the string into individual arguments IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS" From 93254b46c16d25944c4ae2bc8e9b52cd5cf5f52e Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:50:20 +0000 Subject: [PATCH 27/33] Fix --- setup.py | 3 +++ src/open_r1/rewards.py | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/setup.py b/setup.py index 231de49a..907269c2 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ "datasets>=3.2.0", "deepspeed==0.15.4", "distilabel[vllm,ray,openai]>=1.5.2", + "e2b-code-interpreter>=1.0.5", "einops>=0.8.0", "flake8>=6.0.0", "flash_attn>=2.7.4.post1", @@ -60,6 +61,7 @@ "parameterized>=0.9.0", "peft>=0.14.0", "pytest", + "python-dotenv", "ruff>=0.9.0", "safetensors>=0.3.3", "sentencepiece>=0.1.99", @@ -88,6 +90,7 @@ def deps_list(*pkgs): extras["torch"] = deps_list("torch") extras["quality"] = deps_list("ruff", "isort", "flake8") extras["train"] = deps_list("flash_attn") +extras["code"] = deps_list("e2b-code-interpreter", "python-dotenv") extras["eval"] = deps_list("lighteval", "math-verify") extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] + extras["train"] diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 7b99037f..d9beeb60 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -295,6 +295,12 @@ def code_reward(completions, **kwargs) -> list[float]: Assumes the dataset contains a `verification_info` column with test cases. """ + if not is_e2b_available(): + raise ImportError( + "E2B is not available and required for this reward function. Please install E2B with " + "`pip install e2b-code-interpreter` and add an API key to a `.env` file." 
+ ) + rewards = [] try: """Returns a reward function that evaluates code snippets in a sandbox.""" From 853e42b875de293fc52d08da3c9ef507923b32c6 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 22:34:28 +0000 Subject: [PATCH 28/33] Add note --- src/open_r1/rewards.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index d9beeb60..4440335e 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -302,6 +302,7 @@ def code_reward(completions, **kwargs) -> list[float]: ) rewards = [] + # TODO: add support for other languages in E2B: https://e2b.dev/docs/code-interpreting/supported-languages try: """Returns a reward function that evaluates code snippets in a sandbox.""" evaluation_script_template = """ From 23dfafdb151b4a60d4c99c35ca8a10649ce2f44f Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 19 Feb 2025 09:33:34 +0000 Subject: [PATCH 29/33] Add doc --- README.md | 29 +++++++++++++++++++ .../grpo/config_demo_code.yaml | 6 ++-- src/open_r1/rewards.py | 2 +- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a14f180f..40fd8c96 100644 --- a/README.md +++ b/README.md @@ -170,6 +170,35 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con Our final [model](https://huggingface.co/Dongwei/Qwen-2.5-7B_Base_Math_smalllr), while using different learning rates, loss functions and reward structures, achieves 69.4% accuracy on MATH-500, demonstrating a 17%+ improvement over the base model. +#### Training with a code interpreter + +We provide a `code` reward function for executing code generated by the policy during training. Currently, this reward function targets code contests like [Codeforces](https://codeforces.com), where solutions are executed against a set of test cases and the overall success rate is returned as the final reward. To ensure safe execution, we use [E2B](https://e2b.dev) sandboxes, which are fast and cheap to run. To use this reward function, first install the necessary dependencies: + +```shell +uv pip install -e '.[code] +``` + +Then create a `.env` file and place an API token from E2B within it: + +``` +E2B_API_KEY="e2b_xxx" +``` + +Then make sure your dataset contains a `verification_info` column with the following schema (adopted from PrimeIntellect's excellent [datasets](https://huggingface.co/collections/PrimeIntellect/synthetic-1-67a2c399cfdd6c9f7fae0c37) of verifiable problems): + +```python +{ + "language": "python", + "test_cases": [ + { + "input": "4\n4\n0001\n1000\n0011\n0111\n3\n010\n101\n0\n2\n00000\n00001\n4\n01\n001\n0001\n00001\n", + "output": "1\n3 \n-1\n0\n\n2\n1 2 \n", + "type": "stdin_stdout", + } + ], +} +``` + ### Launching jobs on a Slurm cluster If you have access to a Slurm cluster, we provide a `slurm/train.slurm` script that will automatically queue training jobs for you. 
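As a concrete illustration of the reward semantics described in the README section above, here is a rough, sandbox-free sketch of the scoring the `code` reward applies: take the last fenced `python` block from a completion, run it against each stdin/stdout test case, and return the fraction of cases that pass. Everything below is illustrative only — the helper names are made up, and during training the equivalent check runs inside an E2B sandbox (using the language declared in `verification_info`) rather than a local subprocess.

```python
# Illustrative sketch only: a local approximation of the test-case pass rate
# that the `code` reward returns. In training, execution happens inside an
# E2B sandbox, not via a local subprocess.
import re
import subprocess

FENCE = "`" * 3  # three backticks: the marker around fenced code blocks
PYTHON_BLOCK = re.compile(FENCE + r"python\n(.*?)" + FENCE, re.DOTALL)


def last_python_block(completion: str) -> str:
    """Return the last fenced Python block in a completion, or "" if none."""
    matches = PYTHON_BLOCK.findall(completion)
    return matches[-1] if matches else ""


def pass_rate(code: str, test_cases: list[dict]) -> float:
    """Fraction of stdin/stdout test cases that the candidate code solves."""
    if not code or not test_cases:
        return 0.0
    passed = 0
    for case in test_cases:
        try:
            result = subprocess.run(
                ["python3", "-c", code],
                input=case["input"],
                text=True,
                capture_output=True,
                timeout=10,
            )
        except subprocess.TimeoutExpired:
            continue
        if result.returncode == 0 and result.stdout.strip() == case["output"].strip():
            passed += 1
    return passed / len(test_cases)


if __name__ == "__main__":
    completion = (
        "Here is my solution:\n"
        + FENCE + "python\nn = int(input())\nprint(n * 2)\n" + FENCE
    )
    cases = [{"input": "3\n", "output": "6", "type": "stdin_stdout"}]
    print(pass_rate(last_python_block(completion), cases))  # expected: 1.0
```

In this sketch a completion with no extractable code block simply scores 0.0; otherwise the reward is the success rate over the provided test cases, matching the description in the README section above.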
Here's how you can use it: diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 46ee165a..ed3e9daf 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -23,7 +23,7 @@ gradient_accumulation_steps: 4 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -hub_model_id: open-r1/Qwen2.5-1.5B-Open-R1-Code-GRPO +hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO hub_strategy: every_save learning_rate: 5.0e-07 log_completions: true @@ -56,6 +56,4 @@ save_steps: 50 save_total_limit: 1 seed: 42 temperature: 1.0 -warmup_ratio: 0.03 -wandb_entity: huggingface -wandb_project: open-r1 \ No newline at end of file +warmup_ratio: 0.03 \ No newline at end of file diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 4440335e..c003f4d3 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -348,7 +348,7 @@ def evaluate_code(code, test_cases): ] with Sandbox(timeout=30, request_timeout=3) as sbx: for script in scripts: - execution = sbx.run_code(script) + execution = sbx.run_code(script, language=verification_info["language"]) try: output = float(execution.text) except (TypeError, ValueError): From 65c44d89700029049a03602b244939cc8b5dd7b8 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 19 Feb 2025 09:50:51 +0000 Subject: [PATCH 30/33] doc --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 40fd8c96..5f82d72e 100644 --- a/README.md +++ b/README.md @@ -170,12 +170,12 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con Our final [model](https://huggingface.co/Dongwei/Qwen-2.5-7B_Base_Math_smalllr), while using different learning rates, loss functions and reward structures, achieves 69.4% accuracy on MATH-500, demonstrating a 17%+ improvement over the base model. -#### Training with a code interpreter +#### 👨‍💻 Training with a code interpreter We provide a `code` reward function for executing code generated by the policy during training. Currently, this reward function targets code contests like [Codeforces](https://codeforces.com), where solutions are executed against a set of test cases and the overall success rate is returned as the final reward. To ensure safe execution, we use [E2B](https://e2b.dev) sandboxes, which are fast and cheap to run. To use this reward function, first install the necessary dependencies: ```shell -uv pip install -e '.[code] +uv pip install -e '.[code]' ``` Then create a `.env` file and place an API token from E2B within it: From 04381cab1c2862c0ca45998c3ff8a732b4fa2a79 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 19 Feb 2025 09:51:30 +0000 Subject: [PATCH 31/33] Fix --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index ed3e9daf..b82cd8f2 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -11,8 +11,6 @@ dataset_configs: system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" # GRPO trainer config -callbacks: -- push_to_hub_revision beta: 0.01 bf16: true use_vllm: true From fb6e4ae8b37106a097b6e51acd42bf6428499a15 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 19 Feb 2025 09:52:58 +0000 Subject: [PATCH 32/33] Tune lr --- README.md | 2 ++ recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5f82d72e..6201c22d 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,8 @@ Then make sure your dataset contains a `verification_info` column with the follo } ``` +See the [training config](./recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml) for an example with `Qwen2.5-1.5B-Instruct`. + ### Launching jobs on a Slurm cluster If you have access to a Slurm cluster, we provide a `slurm/train.slurm` script that will automatically queue training jobs for you. Here's how you can use it: diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index b82cd8f2..783a4d2a 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -23,7 +23,7 @@ gradient_checkpointing_kwargs: use_reentrant: false hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO hub_strategy: every_save -learning_rate: 5.0e-07 +learning_rate: 5.0e-06 log_completions: true log_level: info logging_first_step: true From 89ded4353c51a15ad1b1b65a373fd42d2874f1ac Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 19 Feb 2025 10:03:01 +0000 Subject: [PATCH 33/33] Add command --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6201c22d..1111231a 100644 --- a/README.md +++ b/README.md @@ -199,7 +199,13 @@ Then make sure your dataset contains a `verification_info` column with the follo } ``` -See the [training config](./recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml) for an example with `Qwen2.5-1.5B-Instruct`. +For example, to train a smol model on Python problems, run: + +```shell +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ + --num_processes=7 src/open_r1/grpo.py \ + --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +``` ### Launching jobs on a Slurm cluster
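To round out the command above, here is a minimal sketch of how a compatible dataset row could be assembled before pointing `dataset_name` at it in the GRPO config. The `problem` column name, the prompt text, and the `my-org/verifiable-problems-demo` repo id are hypothetical placeholders; only the `verification_info` layout (`language` plus stdin/stdout `test_cases`) follows the schema documented in the README changes above.

```python
# Hypothetical sketch: build a tiny dataset carrying the `verification_info`
# schema expected by the `code` reward. The column name, prompt text, and Hub
# repo id are placeholders, not part of the actual recipes.
from datasets import Dataset

rows = [
    {
        # Hypothetical problem statement shown to the policy.
        "problem": "Read an integer n from standard input and print n * 2.",
        "verification_info": {
            "language": "python",
            "test_cases": [
                {"input": "3\n", "output": "6\n", "type": "stdin_stdout"},
                {"input": "10\n", "output": "20\n", "type": "stdin_stdout"},
            ],
        },
    }
]

dataset = Dataset.from_list(rows)
print(dataset[0]["verification_info"]["test_cases"][0])

# Once uploaded, reference the repo id via `dataset_name` in the YAML config:
# dataset.push_to_hub("my-org/verifiable-problems-demo")
```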