From e0decfde3b37fd48b2175648b97754750a48cd38 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 10:53:28 +0000 Subject: [PATCH 01/33] Add stuff --- .gitignore | 4 +- .../grpo/config_demo.yaml | 50 ++++++++++++++ src/open_r1/grpo.py | 2 + src/open_r1/rewards.py | 66 +++++++++++++++++++ 4 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml diff --git a/.gitignore b/.gitignore index d44c47f6..f4db7819 100644 --- a/.gitignore +++ b/.gitignore @@ -175,4 +175,6 @@ data/ wandb/ logs/ eval_results/ -results/ \ No newline at end of file +results/ + +.vscode/ \ No newline at end of file diff --git a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml new file mode 100644 index 00000000..19f4d6f5 --- /dev/null +++ b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml @@ -0,0 +1,50 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-1.5B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: AI-MO/NuminaMath-TIR +dataset_configs: +- all +# Num processes is less by 1 as vLLM is using 1 GPU +num_processes: 7 + +# GRPO trainer config +bf16: true +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: true +eval_strategy: steps +eval_steps: 100 +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-Coder-1.5B-Open-R1-GRPO +hub_strategy: every_save +learning_rate: 2.0e-05 +log_completions: true +log_level: info +logging_steps: 5 +logging_strategy: steps +lr_scheduler_type: cosine +max_prompt_length: 512 +max_completion_length: 1024 +max_steps: -1 +num_generations: 7 +num_train_epochs: 1 +output_dir: data/Qwen2.5-Coder-1.5B-Open-R1-GRPO +overwrite_output_dir: true +per_device_eval_batch_size: 32 +per_device_train_batch_size: 16 +push_to_hub: false #true +report_to: +- wandb +reward_funcs: +- code_reward +save_strategy: "no" +seed: 42 +warmup_ratio: 0.1 diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 128375db..f7bd036e 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -31,6 +31,7 @@ get_cosine_scaled_reward, get_repetition_penalty_reward, reasoning_steps_reward, + code_reward ) from open_r1.utils.callbacks import get_callbacks from open_r1.utils.logging import init_wandb_training @@ -162,6 +163,7 @@ def main(script_args, training_args, model_args): ngram_size=script_args.repetition_n_grams, max_penalty=script_args.repetition_max_penalty, ), + "code_reward": code_reward, } reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index bec3d11c..19d18442 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -5,6 +5,7 @@ from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify +from e2b_code_interpreter import Sandbox def accuracy_reward(completions, solution, **kwargs): @@ -197,3 +198,68 @@ def repetition_penalty_reward(completions, **kwargs) -> float: return rewards return repetition_penalty_reward + + +def extract_code(completion : str) -> str: + pattern = re.compile(r"```python\n(.*?)```", re.DOTALL) + matches = pattern.findall(completion) + extracted_answer = matches[-1] if len(matches) >= 1 else "" + return extracted_answer + +import json + +def 
code_reward(completions, **kwargs): + from e2b_code_interpreter import Sandbox + + sbx = Sandbox() + """Returns a reward function that evaluates code snippets in a sandbox.""" + evaluation_script_template = """import subprocess + import json + + def evaluate_code(code, test_cases): + passed = 0 + total = len(test_cases) + + for case in test_cases: + process = subprocess.run( + ["python3", "-c", code], + input=case["input"], + text=True, + capture_output=True + ) + + if process.returncode != 0: # Error in execution + continue + + output = process.stdout.strip() + if output == case["output"]: + passed += 1 + + success_rate = (passed / total) + + code_snippet = {code} + test_cases = json.loads({test_cases}) + + evaluate_code(code_snippet, test_cases) + """ + code_snippets = [extract_code(completion) for completion in completions] + test_cases = kwargs["verification_info"]["test_cases"] + scripts = [evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(test_cases))) for code in code_snippets] + rewards = [] + for script in scripts: + execution = sbx.run_code(script, on_stdout=lambda data: print('stdout:', data)) # Execute Python inside the sandbox + + output = "" + if len(execution.logs.stdout) > 0: + output += "\n".join(execution.logs.stdout) + if len(execution.logs.stderr) > 0: + output += "\n".join(execution.logs.stderr) + if execution.error is not None: + output += execution.error.traceback + + # convert output to float + output = float(output) + rewards.append(output) + return rewards + + From da197834db0b1502df4baa5e5952e4a3e7daae36 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 12:14:41 +0000 Subject: [PATCH 02/33] Make it kind of work --- .../grpo/config_demo.yaml | 20 +++++------ src/open_r1/grpo.py | 2 +- src/open_r1/rewards.py | 35 ++++++++++++------- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml index 19f4d6f5..c8433c1a 100644 --- a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml @@ -5,7 +5,7 @@ torch_dtype: bfloat16 attn_implementation: flash_attention_2 # Data training arguments -dataset_name: AI-MO/NuminaMath-TIR +dataset_name: open-r1/verifiable-coding-problems-python-1k dataset_configs: - all # Num processes is less by 1 as vLLM is using 1 GPU @@ -17,34 +17,34 @@ use_vllm: true vllm_device: auto vllm_gpu_memory_utilization: 0.7 do_eval: true -eval_strategy: steps +eval_strategy: 'no' #steps eval_steps: 100 -gradient_accumulation_steps: 16 +gradient_accumulation_steps: 1 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false hub_model_id: Qwen2.5-Coder-1.5B-Open-R1-GRPO hub_strategy: every_save -learning_rate: 2.0e-05 +learning_rate: 1.0e-06 log_completions: true log_level: info -logging_steps: 5 +logging_steps: 1 logging_strategy: steps lr_scheduler_type: cosine -max_prompt_length: 512 -max_completion_length: 1024 +max_prompt_length: 1024 +max_completion_length: 2048 max_steps: -1 num_generations: 7 num_train_epochs: 1 output_dir: data/Qwen2.5-Coder-1.5B-Open-R1-GRPO overwrite_output_dir: true -per_device_eval_batch_size: 32 -per_device_train_batch_size: 16 +per_device_train_batch_size: 1 push_to_hub: false #true report_to: - wandb reward_funcs: -- code_reward +- code +- format save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 
30e60832..3d8de0dc 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -165,7 +165,7 @@ def main(script_args, training_args, model_args): max_penalty=script_args.repetition_max_penalty, ), "length": len_reward, - "code_reward": code_reward, + "code": code_reward, } reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 639cde5b..edb00ad5 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -283,11 +283,14 @@ def extract_code(completion : str) -> str: import json def code_reward(completions, **kwargs): + from dotenv import load_dotenv + load_dotenv() from e2b_code_interpreter import Sandbox sbx = Sandbox() """Returns a reward function that evaluates code snippets in a sandbox.""" - evaluation_script_template = """import subprocess + evaluation_script_template = """ + import subprocess import json def evaluate_code(code, test_cases): @@ -310,30 +313,38 @@ def evaluate_code(code, test_cases): passed += 1 success_rate = (passed / total) + return success_rate code_snippet = {code} test_cases = json.loads({test_cases}) evaluate_code(code_snippet, test_cases) """ - code_snippets = [extract_code(completion) for completion in completions] - test_cases = kwargs["verification_info"]["test_cases"] - scripts = [evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(test_cases))) for code in code_snippets] + code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] + verification_info = kwargs["verification_info"] + scripts = [evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"]))) for code, info in zip(code_snippets, verification_info)] rewards = [] for script in scripts: - execution = sbx.run_code(script, on_stdout=lambda data: print('stdout:', data)) # Execute Python inside the sandbox + print(f"=== Script ===\n{script}\n=== End of Script ===") - output = "" - if len(execution.logs.stdout) > 0: - output += "\n".join(execution.logs.stdout) - if len(execution.logs.stderr) > 0: - output += "\n".join(execution.logs.stderr) - if execution.error is not None: - output += execution.error.traceback + execution = sbx.run_code(script, on_stdout=lambda data: print('stdout:', data), request_timeout=3) # Execute Python inside the sandbox + + print(f"=== Execution ===\n{execution}\n=== End of Execution ===") + + output = execution.text + + # if len(execution.logs.stdout) > 0: + # output += "\n".join(execution.logs.stdout) + # if len(execution.logs.stderr) > 0: + # output += "\n".join(execution.logs.stderr) + # if execution.error is not None: + # output += execution.error.traceback # convert output to float output = float(output) rewards.append(output) + + print(f"Rewards: {rewards}") return rewards From 6ba5302e42814b6aed22a89096adcf0d67819a51 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 14:15:17 +0000 Subject: [PATCH 03/33] Add more stuff --- .../grpo/config_demo.yaml | 2 + .../grpo/config_demo.yaml | 6 +-- src/open_r1/grpo.py | 2 +- src/open_r1/rewards.py | 38 +++++++++---------- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml index 81793939..595b09b5 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml @@ -42,6 +42,8 @@ per_device_train_batch_size: 16 push_to_hub: 
true report_to: - wandb +reward_funcs: +- format save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml index c8433c1a..43252e4a 100644 --- a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml @@ -8,8 +8,6 @@ attn_implementation: flash_attention_2 dataset_name: open-r1/verifiable-coding-problems-python-1k dataset_configs: - all -# Num processes is less by 1 as vLLM is using 1 GPU -num_processes: 7 # GRPO trainer config bf16: true @@ -38,13 +36,13 @@ num_generations: 7 num_train_epochs: 1 output_dir: data/Qwen2.5-Coder-1.5B-Open-R1-GRPO overwrite_output_dir: true -per_device_train_batch_size: 1 +per_device_train_batch_size: 2 push_to_hub: false #true report_to: - wandb reward_funcs: +# - format - code -- format save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 3d8de0dc..592434ae 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -27,12 +27,12 @@ from open_r1.configs import GRPOConfig from open_r1.rewards import ( accuracy_reward, + code_reward, format_reward, get_cosine_scaled_reward, get_repetition_penalty_reward, len_reward, reasoning_steps_reward, - code_reward ) from open_r1.utils.callbacks import get_callbacks from open_r1.utils.wandb_logging import init_wandb_training diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index edb00ad5..00ef3841 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -4,9 +4,9 @@ import re from typing import Dict +from e2b_code_interpreter import Sandbox from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify -from e2b_code_interpreter import Sandbox def accuracy_reward(completions, solution, **kwargs): @@ -274,16 +274,19 @@ def repetition_penalty_reward(completions, **kwargs) -> float: return repetition_penalty_reward -def extract_code(completion : str) -> str: +def extract_code(completion: str) -> str: pattern = re.compile(r"```python\n(.*?)```", re.DOTALL) matches = pattern.findall(completion) extracted_answer = matches[-1] if len(matches) >= 1 else "" return extracted_answer + import json + def code_reward(completions, **kwargs): from dotenv import load_dotenv + load_dotenv() from e2b_code_interpreter import Sandbox @@ -322,29 +325,22 @@ def evaluate_code(code, test_cases): """ code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] verification_info = kwargs["verification_info"] - scripts = [evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"]))) for code, info in zip(code_snippets, verification_info)] + scripts = [ + evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"]))) + for code, info in zip(code_snippets, verification_info) + ] rewards = [] for script in scripts: - print(f"=== Script ===\n{script}\n=== End of Script ===") + # print(f"=== Script ===\n{script}\n=== End of Script ===") - execution = sbx.run_code(script, on_stdout=lambda data: print('stdout:', data), request_timeout=3) # Execute Python inside the sandbox + execution = sbx.run_code( + script, on_stdout=lambda data: print("stdout:", data), request_timeout=3 + ) # Execute Python inside the sandbox - print(f"=== Execution ===\n{execution}\n=== End of Execution ===") + # print(f"=== Execution 
===\n{execution}\n=== End of Execution ===") - output = execution.text - - # if len(execution.logs.stdout) > 0: - # output += "\n".join(execution.logs.stdout) - # if len(execution.logs.stderr) > 0: - # output += "\n".join(execution.logs.stderr) - # if execution.error is not None: - # output += execution.error.traceback - - # convert output to float - output = float(output) + output = float(execution.text) rewards.append(output) - - print(f"Rewards: {rewards}") - return rewards - + # print(f"Rewards: {rewards}") + return rewards From 78cf722790eea27443d95238289224a44b0b8311 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 16:17:04 +0000 Subject: [PATCH 04/33] Add fix for parse --- .../grpo/config_demo.yaml | 6 +- src/open_r1/rewards.py | 118 ++++++++++-------- 2 files changed, 70 insertions(+), 54 deletions(-) diff --git a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml index 43252e4a..d1fe4db1 100644 --- a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml @@ -36,13 +36,15 @@ num_generations: 7 num_train_epochs: 1 output_dir: data/Qwen2.5-Coder-1.5B-Open-R1-GRPO overwrite_output_dir: true -per_device_train_batch_size: 2 +per_device_train_batch_size: 1 push_to_hub: false #true report_to: - wandb reward_funcs: -# - format +- format - code save_strategy: "no" seed: 42 warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 00ef3841..9d9ac1df 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -1,14 +1,19 @@ """Reward functions for GRPO training.""" +import json import math import re from typing import Dict +from dotenv import load_dotenv from e2b_code_interpreter import Sandbox from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify +load_dotenv() + + def accuracy_reward(completions, solution, **kwargs): """Reward function that checks if the completion is the same as the ground truth.""" contents = [completion[0]["content"] for completion in completions] @@ -281,66 +286,75 @@ def extract_code(completion: str) -> str: return extracted_answer -import json - - def code_reward(completions, **kwargs): - from dotenv import load_dotenv - - load_dotenv() - from e2b_code_interpreter import Sandbox - - sbx = Sandbox() - """Returns a reward function that evaluates code snippets in a sandbox.""" - evaluation_script_template = """ - import subprocess - import json - - def evaluate_code(code, test_cases): - passed = 0 - total = len(test_cases) - - for case in test_cases: - process = subprocess.run( - ["python3", "-c", code], - input=case["input"], - text=True, - capture_output=True - ) + rewards = [] + try: + sbx = Sandbox(timeout=30, request_timeout=3) + """Returns a reward function that evaluates code snippets in a sandbox.""" + evaluation_script_template = """ + import subprocess + import json + + def evaluate_code(code, test_cases): + passed = 0 + total = len(test_cases) + + for case in test_cases: + process = subprocess.run( + ["python3", "-c", code], + input=case["input"], + text=True, + capture_output=True + ) - if process.returncode != 0: # Error in execution - continue + if process.returncode != 0: # Error in execution + continue - output = process.stdout.strip() - if output == case["output"]: - passed += 1 + output = process.stdout.strip() + print("output") + print(output) + print() + 
print("case") + print(case["output"]) + print() + print(output.strip() == case["output"].strip()) + if output.strip() == case["output"].strip(): + passed += 1 - success_rate = (passed / total) - return success_rate + success_rate = (passed / total) + return success_rate - code_snippet = {code} - test_cases = json.loads({test_cases}) + code_snippet = {code} + test_cases = json.loads({test_cases}) - evaluate_code(code_snippet, test_cases) - """ - code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] - verification_info = kwargs["verification_info"] - scripts = [ - evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"]))) - for code, info in zip(code_snippets, verification_info) - ] - rewards = [] - for script in scripts: - # print(f"=== Script ===\n{script}\n=== End of Script ===") + evaluate_code(code_snippet, test_cases) + """ + code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] + # gold_code_snippets = [extract_code(sol) for sol in kwargs["gold_standard_solution"]] + verification_info = kwargs["verification_info"] + scripts = [ + evaluation_script_template.format( + code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"])) + ) + for code, info in zip(code_snippets, verification_info) + ] + for script in scripts: + # print(f"=== Script ===\n{script}\n=== End of Script ===") + + execution = sbx.run_code( + script, on_stdout=lambda data: print("stdout:", data) + ) # Execute Python inside the sandbox - execution = sbx.run_code( - script, on_stdout=lambda data: print("stdout:", data), request_timeout=3 - ) # Execute Python inside the sandbox + # print(f"=== Execution ===\n{execution}\n=== End of Execution ===") - # print(f"=== Execution ===\n{execution}\n=== End of Execution ===") + output = float(execution.text) + rewards.append(output) - output = float(execution.text) - rewards.append(output) + # print(f"Rewards: {rewards}") - # print(f"Rewards: {rewards}") + # Shutdown to stay in limits + sbx.kill() + except Exception as e: + print(f"Error: {e}") + rewards = [0.0] * len(completions) return rewards From 24dc34f9c8a60cb7bd4f4a9d3d8adc281085c29b Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 16:47:01 +0000 Subject: [PATCH 05/33] Fix --- .../grpo/config_demo.yaml | 4 +--- src/open_r1/grpo.py | 2 +- src/open_r1/rewards.py | 23 +++++++------------ 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml index d1fe4db1..1440a5b1 100644 --- a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml @@ -1,5 +1,5 @@ # Model arguments -model_name_or_path: Qwen/Qwen2.5-Coder-1.5B-Instruct +model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct model_revision: main torch_dtype: bfloat16 attn_implementation: flash_attention_2 @@ -46,5 +46,3 @@ reward_funcs: save_strategy: "no" seed: 42 warmup_ratio: 0.1 -wandb_entity: huggingface -wandb_project: open-r1 diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 592434ae..9ef7f1cc 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -103,7 +103,7 @@ class GRPOScriptArguments(ScriptArguments): "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. 
The assistant " "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning " "process and answer are enclosed within and tags, respectively, i.e., " - " reasoning process here answer here " + " reasoning process here answer here . Make sure the Python code in the answer is enclosed with backticks." ) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 9d9ac1df..3b4c25f8 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -289,7 +289,6 @@ def extract_code(completion: str) -> str: def code_reward(completions, **kwargs): rewards = [] try: - sbx = Sandbox(timeout=30, request_timeout=3) """Returns a reward function that evaluates code snippets in a sandbox.""" evaluation_script_template = """ import subprocess @@ -338,22 +337,16 @@ def evaluate_code(code, test_cases): ) for code, info in zip(code_snippets, verification_info) ] - for script in scripts: - # print(f"=== Script ===\n{script}\n=== End of Script ===") + with Sandbox(timeout=30, request_timeout=3) as sbx: + for script in scripts: + print("Running code in sandbox") + execution = sbx.run_code(script) + print("Execution completed") - execution = sbx.run_code( - script, on_stdout=lambda data: print("stdout:", data) - ) # Execute Python inside the sandbox + output = float(execution.text) + rewards.append(output) - # print(f"=== Execution ===\n{execution}\n=== End of Execution ===") - - output = float(execution.text) - rewards.append(output) - - # print(f"Rewards: {rewards}") - - # Shutdown to stay in limits - sbx.kill() + # print(f"Rewards: {rewards}") except Exception as e: print(f"Error: {e}") rewards = [0.0] * len(completions) From 22244fe73d6a18d090377d2482d2728a6c3f3f76 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 17:55:37 +0000 Subject: [PATCH 06/33] Refactor --- .../grpo/config_demo.yaml | 8 ++++---- src/open_r1/grpo.py | 2 +- src/open_r1/rewards.py | 13 +++++++++---- 3 files changed, 14 insertions(+), 9 deletions(-) rename recipes/{Qwen2.5-Coder-1.5B-Instruct => Qwen2.5-Coder-3B-Instruct}/grpo/config_demo.yaml (85%) diff --git a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml similarity index 85% rename from recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml rename to recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml index 1440a5b1..6962f3c3 100644 --- a/recipes/Qwen2.5-Coder-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml @@ -21,9 +21,9 @@ gradient_accumulation_steps: 1 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -hub_model_id: Qwen2.5-Coder-1.5B-Open-R1-GRPO +hub_model_id: Qwen2.5-Coder-3B-Open-R1-GRPO hub_strategy: every_save -learning_rate: 1.0e-06 +learning_rate: 1.0e-05 log_completions: true log_level: info logging_steps: 1 @@ -34,9 +34,9 @@ max_completion_length: 2048 max_steps: -1 num_generations: 7 num_train_epochs: 1 -output_dir: data/Qwen2.5-Coder-1.5B-Open-R1-GRPO +output_dir: data/Qwen2.5-Coder-3B-Open-R1-GRPO overwrite_output_dir: true -per_device_train_batch_size: 1 +per_device_train_batch_size: 2 push_to_hub: false #true report_to: - wandb diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 9ef7f1cc..abe6d947 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -103,7 +103,7 @@ class GRPOScriptArguments(ScriptArguments): "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. 
The assistant " "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning " "process and answer are enclosed within and tags, respectively, i.e., " - " reasoning process here answer here . Make sure the Python code in the answer is enclosed with backticks." + " reasoning process here ```python\n# ANSWER HERE\n``` ." ) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 3b4c25f8..d82876bf 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -297,13 +297,15 @@ def code_reward(completions, **kwargs): def evaluate_code(code, test_cases): passed = 0 total = len(test_cases) + exec_timeout = 5 for case in test_cases: process = subprocess.run( ["python3", "-c", code], input=case["input"], text=True, - capture_output=True + capture_output=True, + timeout=exec_timeout ) if process.returncode != 0: # Error in execution @@ -338,12 +340,15 @@ def evaluate_code(code, test_cases): for code, info in zip(code_snippets, verification_info) ] with Sandbox(timeout=30, request_timeout=3) as sbx: - for script in scripts: + for code, script in zip(code_snippets, scripts): print("Running code in sandbox") - execution = sbx.run_code(script) + execution = sbx.run_code(script, request_timeout=3) print("Execution completed") - output = float(execution.text) + try: + output = float(execution.text) + except (TypeError, ValueError): + output = 0.0 rewards.append(output) # print(f"Rewards: {rewards}") From c32d1377bf476d5423ff283ef114473fa88a09e0 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 21:15:36 +0000 Subject: [PATCH 07/33] Clean up --- .../grpo/config_demo.yaml | 9 ++++++--- src/open_r1/rewards.py | 15 +-------------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml index 6962f3c3..11d40fbd 100644 --- a/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml @@ -23,7 +23,7 @@ gradient_checkpointing_kwargs: use_reentrant: false hub_model_id: Qwen2.5-Coder-3B-Open-R1-GRPO hub_strategy: every_save -learning_rate: 1.0e-05 +learning_rate: 1.0e-06 log_completions: true log_level: info logging_steps: 1 @@ -32,17 +32,20 @@ lr_scheduler_type: cosine max_prompt_length: 1024 max_completion_length: 2048 max_steps: -1 -num_generations: 7 +num_generations: 14 num_train_epochs: 1 output_dir: data/Qwen2.5-Coder-3B-Open-R1-GRPO overwrite_output_dir: true -per_device_train_batch_size: 2 +per_device_train_batch_size: 8 push_to_hub: false #true report_to: - wandb reward_funcs: - format - code +reward_weights: +- 0.1 +- 1.0 save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index d82876bf..149e162b 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -312,13 +312,6 @@ def evaluate_code(code, test_cases): continue output = process.stdout.strip() - print("output") - print(output) - print() - print("case") - print(case["output"]) - print() - print(output.strip() == case["output"].strip()) if output.strip() == case["output"].strip(): passed += 1 @@ -331,7 +324,6 @@ def evaluate_code(code, test_cases): evaluate_code(code_snippet, test_cases) """ code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] - # gold_code_snippets = [extract_code(sol) for sol in kwargs["gold_standard_solution"]] verification_info = kwargs["verification_info"] scripts = [ 
evaluation_script_template.format( @@ -340,18 +332,13 @@ def evaluate_code(code, test_cases): for code, info in zip(code_snippets, verification_info) ] with Sandbox(timeout=30, request_timeout=3) as sbx: - for code, script in zip(code_snippets, scripts): - print("Running code in sandbox") + for script in scripts: execution = sbx.run_code(script, request_timeout=3) - print("Execution completed") - try: output = float(execution.text) except (TypeError, ValueError): output = 0.0 rewards.append(output) - - # print(f"Rewards: {rewards}") except Exception as e: print(f"Error: {e}") rewards = [0.0] * len(completions) From dab15e0567db2fc552a62f2b486c13e6d0e0b625 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Thu, 13 Feb 2025 21:41:19 +0000 Subject: [PATCH 08/33] Fix config --- ...ig_demo.yaml => config_codeforces_1k.yaml} | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) rename recipes/Qwen2.5-Coder-3B-Instruct/grpo/{config_demo.yaml => config_codeforces_1k.yaml} (71%) diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml similarity index 71% rename from recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml rename to recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml index 11d40fbd..1e33be08 100644 --- a/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml @@ -10,42 +10,46 @@ dataset_configs: - all # GRPO trainer config +benchmarks: +- gpqa bf16: true -use_vllm: true -vllm_device: auto -vllm_gpu_memory_utilization: 0.7 -do_eval: true -eval_strategy: 'no' #steps -eval_steps: 100 +callbacks: +- push_to_hub_revision +eval_strategy: 'no' gradient_accumulation_steps: 1 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -hub_model_id: Qwen2.5-Coder-3B-Open-R1-GRPO +hub_model_id: open-r1/Qwen2.5-Coder-3B-GRPO hub_strategy: every_save learning_rate: 1.0e-06 log_completions: true log_level: info logging_steps: 1 logging_strategy: steps -lr_scheduler_type: cosine +lr_scheduler_type: constant max_prompt_length: 1024 max_completion_length: 2048 max_steps: -1 num_generations: 14 num_train_epochs: 1 -output_dir: data/Qwen2.5-Coder-3B-Open-R1-GRPO +output_dir: data/Qwen2.5-Coder-3B-GRPO overwrite_output_dir: true -per_device_train_batch_size: 8 -push_to_hub: false #true +per_device_train_batch_size: 16 +push_to_hub: true report_to: - wandb reward_funcs: - format - code reward_weights: -- 0.1 +- 0.25 - 1.0 -save_strategy: "no" +save_strategy: "steps" +save_steps: 10 +save_total_limit: 1 seed: 42 -warmup_ratio: 0.1 +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +warmup_ratio: 0.0 From edc502d00e4210a728458a2fab15244f80a58b6b Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Fri, 14 Feb 2025 09:12:22 +0000 Subject: [PATCH 09/33] Fix sys --- src/open_r1/grpo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index abe6d947..592434ae 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -103,7 +103,7 @@ class GRPOScriptArguments(ScriptArguments): "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant " "first thinks about the reasoning process in the mind and then provides the user with the answer. 
The reasoning " "process and answer are enclosed within and tags, respectively, i.e., " - " reasoning process here ```python\n# ANSWER HERE\n``` ." + " reasoning process here answer here " ) From 27af68edce3b36fcd722a22dc7968240784a0a05 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sat, 15 Feb 2025 21:37:02 +0000 Subject: [PATCH 10/33] Add SFT config --- .../sft/config_demo.yaml | 2 +- .../sft/config_openthoughts_code.yaml | 49 +++++++++++++++++++ slurm/train.slurm | 8 +-- src/open_r1/sft.py | 2 + 4 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml diff --git a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml index c7dd25bb..fb5830b9 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml @@ -8,7 +8,7 @@ attn_implementation: flash_attention_2 dataset_name: HuggingFaceH4/Bespoke-Stratos-17k dataset_configs: - all -preprocessing_num_workers: 8 +dataset_num_proc: 8 # SFT trainer config bf16: true diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml new file mode 100644 index 00000000..ec5e342b --- /dev/null +++ b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml @@ -0,0 +1,49 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenThoughts-114k-code +dataset_configs: +- all +dataset_num_proc: 48 + +# SFT trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- gpqa +bf16: true +do_eval: true +eval_strategy: epoch +gradient_accumulation_steps: 4 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-3B-Instruct-SFT-openthoughts-code-v00.00 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_level: info +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +packing: true +max_seq_length: 32768 +max_steps: -1 +num_train_epochs: 3 +output_dir: data/Qwen2.5-Coder-3B-Instruct-SFT-openthoughts-code-v00.00 +overwrite_output_dir: true +per_device_eval_batch_size: 4 +per_device_train_batch_size: 4 +push_to_hub: true +report_to: +- wandb +save_strategy: "epoch" +save_total_limit: 1 +seed: 42 +use_liger: true +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.03 \ No newline at end of file diff --git a/slurm/train.slurm b/slurm/train.slurm index c10a2a23..c212afed 100644 --- a/slurm/train.slurm +++ b/slurm/train.slurm @@ -32,11 +32,11 @@ WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) # Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') -USE_VLLM=$(grep 'use_vllm:\s*true' $CONFIG_FILE) # Match "use_vllm: true" (with optional whitespace) +# USE_VLLM=$(grep 'use_vllm:\s*true' $CONFIG_FILE) # Match "use_vllm: true" (with optional whitespace) -if [ -n "$USE_VLLM" ]; then # Check if USE_VLLM is *not* empty (found) - WORLD_SIZE=$(($WORLD_SIZE-1)) -fi +# if [ -n "$USE_VLLM" ]; then # Check if USE_VLLM is *not* empty (found) +# 
WORLD_SIZE=$(($WORLD_SIZE-1)) +# fi # Split the string into individual arguments IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS" diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index b6031d81..91bf74d3 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -67,6 +67,8 @@ def main(script_args, training_args, model_args): # Set seed for reproducibility set_seed(training_args.seed) + training_args.lr_scheduler_kwargs = {"min_lr": training_args.learning_rate * 0.1} + ############### # Setup logging ############### From 53eaddb639c3dc7d08e3cf9f45a18bec63427eba Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sat, 15 Feb 2025 21:52:50 +0000 Subject: [PATCH 11/33] Use min rate --- .../Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml | 2 ++ src/open_r1/sft.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml index ec5e342b..3a6b7583 100644 --- a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml +++ b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml @@ -29,6 +29,8 @@ log_level: info logging_steps: 1 logging_strategy: steps lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 packing: true max_seq_length: 32768 max_steps: -1 diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index 91bf74d3..b6031d81 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -67,8 +67,6 @@ def main(script_args, training_args, model_args): # Set seed for reproducibility set_seed(training_args.seed) - training_args.lr_scheduler_kwargs = {"min_lr": training_args.learning_rate * 0.1} - ############### # Setup logging ############### From 385d79988018a072b265cd53693610f97e1d277c Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 16 Feb 2025 07:30:51 +0000 Subject: [PATCH 12/33] Fix eval --- .../Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml index 3a6b7583..47d23bd4 100644 --- a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml +++ b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml @@ -37,7 +37,7 @@ max_steps: -1 num_train_epochs: 3 output_dir: data/Qwen2.5-Coder-3B-Instruct-SFT-openthoughts-code-v00.00 overwrite_output_dir: true -per_device_eval_batch_size: 4 +per_device_eval_batch_size: 1 per_device_train_batch_size: 4 push_to_hub: true report_to: From 52fc68198579748b63b16f1ce8733b271f1d2069 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 16 Feb 2025 21:25:50 +0000 Subject: [PATCH 13/33] Add base model --- .../grpo/config_codeforces_1k.yaml | 55 +++++++++++++++++++ .../sft/config_openthoughts_code.yaml | 51 +++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml create mode 100644 recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml diff --git a/recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml b/recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml new file mode 100644 index 00000000..1e33be08 --- /dev/null +++ b/recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml @@ -0,0 +1,55 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: 
flash_attention_2 + +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python-1k +dataset_configs: +- all + +# GRPO trainer config +benchmarks: +- gpqa +bf16: true +callbacks: +- push_to_hub_revision +eval_strategy: 'no' +gradient_accumulation_steps: 1 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-3B-GRPO +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant +max_prompt_length: 1024 +max_completion_length: 2048 +max_steps: -1 +num_generations: 14 +num_train_epochs: 1 +output_dir: data/Qwen2.5-Coder-3B-GRPO +overwrite_output_dir: true +per_device_train_batch_size: 16 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- format +- code +reward_weights: +- 0.25 +- 1.0 +save_strategy: "steps" +save_steps: 10 +save_total_limit: 1 +seed: 42 +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +warmup_ratio: 0.0 diff --git a/recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml b/recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml new file mode 100644 index 00000000..33e37136 --- /dev/null +++ b/recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml @@ -0,0 +1,51 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-3B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenThoughts-114k-code +dataset_configs: +- all +dataset_num_proc: 48 + +# SFT trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- gpqa +bf16: true +do_eval: true +eval_strategy: epoch +gradient_accumulation_steps: 4 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-3B-SFT-openthoughts-code-v00.00 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_level: info +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +packing: true +max_seq_length: 32768 +max_steps: -1 +num_train_epochs: 3 +output_dir: data/Qwen2.5-Coder-3B-SFT-openthoughts-code-v00.00 +overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 4 +push_to_hub: true +report_to: +- wandb +save_strategy: "epoch" +save_total_limit: 1 +seed: 42 +use_liger: true +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.03 \ No newline at end of file From 884387f1a04efe7ad228c3190a341f5a5f5b05a3 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 16 Feb 2025 22:13:54 +0000 Subject: [PATCH 14/33] Add s1k --- .../sft/config_s1k.yaml | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml new file mode 100644 index 00000000..516a0c59 --- /dev/null +++ b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml @@ -0,0 +1,52 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/s1K-1.1 +dataset_configs: +- all +dataset_num_proc: 48 + +# SFT trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- gpqa +- math_500 +bf16: true +do_eval: false +eval_strategy: epoch +gradient_accumulation_steps: 1 
+gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-3B-Instruct-SFT-s1k-v00.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_level: info +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +packing: true +max_seq_length: 32768 +max_steps: -1 +num_train_epochs: 15 +output_dir: data/Qwen2.5-Coder-3B-Instruct-SFT-s1k-v00.00 +overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 2 +push_to_hub: true +report_to: +- wandb +save_strategy: "epoch" +save_total_limit: 1 +seed: 42 +use_liger: true +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.05 \ No newline at end of file From 2d3c79794fad8716a252a5b8f16a9a3d53603f98 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Mon, 17 Feb 2025 11:07:57 +0000 Subject: [PATCH 15/33] Disable eval --- recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml index 516a0c59..ccfb2139 100644 --- a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml +++ b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml @@ -18,7 +18,7 @@ benchmarks: - math_500 bf16: true do_eval: false -eval_strategy: epoch +eval_strategy: 'no' gradient_accumulation_steps: 1 gradient_checkpointing: true gradient_checkpointing_kwargs: From aaa8f6f7c994b484f1a4b90ebc8be21f3cbc562e Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 14:57:06 +0000 Subject: [PATCH 16/33] Fix --- .../grpo/config_code_demo.yaml | 60 +++++++++++++++++++ src/open_r1/rewards.py | 6 +- 2 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml new file mode 100644 index 00000000..9b26bb2a --- /dev/null +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml @@ -0,0 +1,60 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python-10k +dataset_configs: +- default +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.001 +bf16: true +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.9 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO +hub_strategy: every_save +learning_rate: 5.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_prompt_length: 1024 +max_completion_length: 7168 +max_steps: 1000 +num_generations: 14 +num_train_epochs: 1 +output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO +overwrite_output_dir: true +per_device_eval_batch_size: 4 +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- code +- format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 10 +save_total_limit: 1 +seed: 42 +temperature: 1.0 +warmup_ratio: 0.03 +wandb_entity: huggingface +wandb_project: open-r1 \ No newline at end of file diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 0e39d9d2..a0eb2b56 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -287,6 +287,10 @@ def extract_code(completion: str) -> str: def code_reward(completions, **kwargs): + """Reward function that evaluates code snippets using the E2B code interpreter. + + Assumes the dataset contains a `verification_info` column with test cases. + """ rewards = [] try: """Returns a reward function that evaluates code snippets in a sandbox.""" @@ -340,6 +344,6 @@ def evaluate_code(code, test_cases): output = 0.0 rewards.append(output) except Exception as e: - print(f"Error: {e}") + print(f"Error from E2B executor: {e}") rewards = [0.0] * len(completions) return rewards From 20a1ea0690b4d83b4b7468bcc63827455e9b8c4a Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 15:13:45 +0000 Subject: [PATCH 17/33] Add import checker --- ...g_code_demo.yaml => config_demo_code.yaml} | 6 +++-- src/open_r1/rewards.py | 17 ++++++++++---- src/open_r1/utils/__init__.py | 3 ++- src/open_r1/utils/import_utils.py | 23 +++++++++++++++++++ 4 files changed, 41 insertions(+), 8 deletions(-) rename recipes/Qwen2.5-1.5B-Instruct/grpo/{config_code_demo.yaml => config_demo_code.yaml} (93%) create mode 100644 src/open_r1/utils/import_utils.py diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml similarity index 93% rename from recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml rename to recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 9b26bb2a..3f85c66a 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_code_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -11,13 +11,15 @@ dataset_configs: system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" # GRPO trainer config +callbacks: +- push_to_hub_revision beta: 0.001 bf16: true use_vllm: true vllm_device: auto vllm_gpu_memory_utilization: 0.9 do_eval: false -gradient_accumulation_steps: 16 +gradient_accumulation_steps: 1 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false @@ -40,7 +42,7 @@ num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO overwrite_output_dir: true per_device_eval_batch_size: 4 -per_device_train_batch_size: 8 +per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index a0eb2b56..49de06e2 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -5,13 +5,14 @@ import re from typing import Dict -from dotenv import load_dotenv -from e2b_code_interpreter import Sandbox from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify +from .utils import is_e2b_available - -load_dotenv() +if is_e2b_available(): + from dotenv import load_dotenv + from e2b_code_interpreter import Sandbox + load_dotenv() def accuracy_reward(completions, solution, **kwargs): @@ -337,13 +338,19 @@ def evaluate_code(code, test_cases): ] with Sandbox(timeout=30, request_timeout=3) as sbx: for script in scripts: - execution = sbx.run_code(script, request_timeout=3) + print("Running script") + execution = sbx.run_code(script) + print("Script run") try: output = float(execution.text) except (TypeError, ValueError): output = 0.0 + + print(f"Output: {output}") rewards.append(output) except Exception as e: print(f"Error from E2B executor: {e}") rewards = [0.0] * len(completions) + + print("Rewards finished!") return rewards diff --git a/src/open_r1/utils/__init__.py b/src/open_r1/utils/__init__.py index b1de213d..da3ec481 100644 --- a/src/open_r1/utils/__init__.py +++ b/src/open_r1/utils/__init__.py @@ -1,4 +1,5 @@ from .model_utils import get_tokenizer +from .import_utils import is_e2b_available -__all__ = ["get_tokenizer"] +__all__ = ["get_tokenizer", "is_e2b_available"] \ No newline at end of file diff --git a/src/open_r1/utils/import_utils.py b/src/open_r1/utils/import_utils.py new file mode 100644 index 00000000..8893264a --- /dev/null +++ b/src/open_r1/utils/import_utils.py @@ -0,0 +1,23 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from transformers.utils.import_utils import _is_package_available + + +# Use same as transformers.utils.import_utils +_e2b_available = _is_package_available("e2b") + + +def is_e2b_available() -> bool: + return _e2b_available From 58633036fa78f2669f5e6c531b0c76f07548cc26 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 15:41:49 +0000 Subject: [PATCH 18/33] Fix importer --- .../Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 7 +++---- src/open_r1/rewards.py | 11 ++++------- src/open_r1/utils/__init__.py | 4 ++-- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 3f85c66a..cb31a62f 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -19,11 +19,11 @@ use_vllm: true vllm_device: auto vllm_gpu_memory_utilization: 0.9 do_eval: false -gradient_accumulation_steps: 1 +gradient_accumulation_steps: 4 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO +hub_model_id: lewtun/Qwen2.5-1.5B-Open-R1-Code-GRPO hub_strategy: every_save learning_rate: 5.0e-06 log_completions: true @@ -35,13 +35,12 @@ lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 max_prompt_length: 1024 -max_completion_length: 7168 +max_completion_length: 2048 max_steps: 1000 num_generations: 14 num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO overwrite_output_dir: true -per_device_eval_batch_size: 4 per_device_train_batch_size: 16 push_to_hub: true report_to: diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 49de06e2..7b99037f 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -7,11 +7,14 @@ from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify + from .utils import is_e2b_available + if is_e2b_available(): from dotenv import load_dotenv from e2b_code_interpreter import Sandbox + load_dotenv() @@ -287,7 +290,7 @@ def extract_code(completion: str) -> str: return extracted_answer -def code_reward(completions, **kwargs): +def code_reward(completions, **kwargs) -> list[float]: """Reward function that evaluates code snippets using the E2B code interpreter. Assumes the dataset contains a `verification_info` column with test cases. 
@@ -338,19 +341,13 @@ def evaluate_code(code, test_cases): ] with Sandbox(timeout=30, request_timeout=3) as sbx: for script in scripts: - print("Running script") execution = sbx.run_code(script) - print("Script run") try: output = float(execution.text) except (TypeError, ValueError): output = 0.0 - - print(f"Output: {output}") rewards.append(output) except Exception as e: print(f"Error from E2B executor: {e}") rewards = [0.0] * len(completions) - - print("Rewards finished!") return rewards diff --git a/src/open_r1/utils/__init__.py b/src/open_r1/utils/__init__.py index da3ec481..5302463e 100644 --- a/src/open_r1/utils/__init__.py +++ b/src/open_r1/utils/__init__.py @@ -1,5 +1,5 @@ -from .model_utils import get_tokenizer from .import_utils import is_e2b_available +from .model_utils import get_tokenizer -__all__ = ["get_tokenizer", "is_e2b_available"] \ No newline at end of file +__all__ = ["get_tokenizer", "is_e2b_available"] From 8d78b8efdbb47ad2fd00ae38c00157b9492dc10e Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:31:10 +0000 Subject: [PATCH 19/33] Fix --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index cb31a62f..0c737f24 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -36,7 +36,7 @@ lr_scheduler_kwargs: min_lr_rate: 0.1 max_prompt_length: 1024 max_completion_length: 2048 -max_steps: 1000 +max_steps: 500 num_generations: 14 num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO From 932e69e82a5f0ce735145444d02c6d0e1142b7b6 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:31:37 +0000 Subject: [PATCH 20/33] Tune config --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 0c737f24..f379c112 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -34,7 +34,7 @@ logging_strategy: steps lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 -max_prompt_length: 1024 +max_prompt_length: 512 max_completion_length: 2048 max_steps: 500 num_generations: 14 From 258406f771d221d6ac59d0fcedb7827ed39996e2 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:31:58 +0000 Subject: [PATCH 21/33] Tune --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index f379c112..0c737f24 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -34,7 +34,7 @@ logging_strategy: steps lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 -max_prompt_length: 512 +max_prompt_length: 1024 max_completion_length: 2048 max_steps: 500 num_generations: 14 From fd9860e3d4c2a15f94dc78df076420f0c8dcf566 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:32:35 +0000 Subject: [PATCH 22/33] Fix --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 4 ++-- 1 file changed, 
2 insertions(+), 2 deletions(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 0c737f24..202ae86a 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -23,9 +23,9 @@ gradient_accumulation_steps: 4 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -hub_model_id: lewtun/Qwen2.5-1.5B-Open-R1-Code-GRPO +hub_model_id: open-r1/Qwen2.5-1.5B-Open-R1-Code-GRPO hub_strategy: every_save -learning_rate: 5.0e-06 +learning_rate: 5.0e-07 log_completions: true log_level: info logging_first_step: true From c614dbd0b5fddee55260e8c696ebfc794cc4e471 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:36:16 +0000 Subject: [PATCH 23/33] Fix save --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 202ae86a..6e3ca4ff 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -52,7 +52,7 @@ reward_weights: - 1.0 - 0.1 save_strategy: "steps" -save_steps: 10 +save_steps: 50 save_total_limit: 1 seed: 42 temperature: 1.0 From 51815b2eb6b3179b4ab61bdd1fb4ae6893a46b52 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:38:45 +0000 Subject: [PATCH 24/33] Tuen beta --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 6e3ca4ff..46ee165a 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -13,7 +13,7 @@ system_prompt: "You are a helpful AI Assistant that provides well-reasoned and d # GRPO trainer config callbacks: - push_to_hub_revision -beta: 0.001 +beta: 0.01 bf16: true use_vllm: true vllm_device: auto From da0840799bfea50c841f39f135f65b38f33bb383 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:43:20 +0000 Subject: [PATCH 25/33] Remove configs --- .../grpo/config_codeforces_1k.yaml | 55 ------------------- .../sft/config_openthoughts_code.yaml | 51 ----------------- .../sft/config_s1k.yaml | 52 ------------------ .../grpo/config_codeforces_1k.yaml | 55 ------------------- .../sft/config_openthoughts_code.yaml | 51 ----------------- 5 files changed, 264 deletions(-) delete mode 100644 recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml delete mode 100644 recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml delete mode 100644 recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml delete mode 100644 recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml delete mode 100644 recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml deleted file mode 100644 index 1e33be08..00000000 --- a/recipes/Qwen2.5-Coder-3B-Instruct/grpo/config_codeforces_1k.yaml +++ /dev/null @@ -1,55 +0,0 @@ -# Model arguments -model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training 
arguments -dataset_name: open-r1/verifiable-coding-problems-python-1k -dataset_configs: -- all - -# GRPO trainer config -benchmarks: -- gpqa -bf16: true -callbacks: -- push_to_hub_revision -eval_strategy: 'no' -gradient_accumulation_steps: 1 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: open-r1/Qwen2.5-Coder-3B-GRPO -hub_strategy: every_save -learning_rate: 1.0e-06 -log_completions: true -log_level: info -logging_steps: 1 -logging_strategy: steps -lr_scheduler_type: constant -max_prompt_length: 1024 -max_completion_length: 2048 -max_steps: -1 -num_generations: 14 -num_train_epochs: 1 -output_dir: data/Qwen2.5-Coder-3B-GRPO -overwrite_output_dir: true -per_device_train_batch_size: 16 -push_to_hub: true -report_to: -- wandb -reward_funcs: -- format -- code -reward_weights: -- 0.25 -- 1.0 -save_strategy: "steps" -save_steps: 10 -save_total_limit: 1 -seed: 42 -use_vllm: true -vllm_device: auto -vllm_gpu_memory_utilization: 0.7 -warmup_ratio: 0.0 diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml deleted file mode 100644 index 47d23bd4..00000000 --- a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_openthoughts_code.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# Model arguments -model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training arguments -dataset_name: open-r1/OpenThoughts-114k-code -dataset_configs: -- all -dataset_num_proc: 48 - -# SFT trainer config -callbacks: -- push_to_hub_revision -benchmarks: -- gpqa -bf16: true -do_eval: true -eval_strategy: epoch -gradient_accumulation_steps: 4 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: open-r1/Qwen2.5-Coder-3B-Instruct-SFT-openthoughts-code-v00.00 -hub_strategy: every_save -learning_rate: 5.0e-07 -log_level: info -logging_steps: 1 -logging_strategy: steps -lr_scheduler_type: cosine_with_min_lr -lr_scheduler_kwargs: - min_lr_rate: 0.1 -packing: true -max_seq_length: 32768 -max_steps: -1 -num_train_epochs: 3 -output_dir: data/Qwen2.5-Coder-3B-Instruct-SFT-openthoughts-code-v00.00 -overwrite_output_dir: true -per_device_eval_batch_size: 1 -per_device_train_batch_size: 4 -push_to_hub: true -report_to: -- wandb -save_strategy: "epoch" -save_total_limit: 1 -seed: 42 -use_liger: true -wandb_entity: huggingface -wandb_project: open-r1 -warmup_ratio: 0.03 \ No newline at end of file diff --git a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml b/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml deleted file mode 100644 index ccfb2139..00000000 --- a/recipes/Qwen2.5-Coder-3B-Instruct/sft/config_s1k.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# Model arguments -model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training arguments -dataset_name: open-r1/s1K-1.1 -dataset_configs: -- all -dataset_num_proc: 48 - -# SFT trainer config -callbacks: -- push_to_hub_revision -benchmarks: -- gpqa -- math_500 -bf16: true -do_eval: false -eval_strategy: 'no' -gradient_accumulation_steps: 1 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: open-r1/Qwen2.5-Coder-3B-Instruct-SFT-s1k-v00.00 -hub_strategy: every_save -learning_rate: 1.0e-06 -log_level: info -logging_steps: 1 -logging_strategy: steps -lr_scheduler_type: 
cosine_with_min_lr -lr_scheduler_kwargs: - min_lr_rate: 0.1 -packing: true -max_seq_length: 32768 -max_steps: -1 -num_train_epochs: 15 -output_dir: data/Qwen2.5-Coder-3B-Instruct-SFT-s1k-v00.00 -overwrite_output_dir: true -per_device_eval_batch_size: 1 -per_device_train_batch_size: 2 -push_to_hub: true -report_to: -- wandb -save_strategy: "epoch" -save_total_limit: 1 -seed: 42 -use_liger: true -wandb_entity: huggingface -wandb_project: open-r1 -warmup_ratio: 0.05 \ No newline at end of file diff --git a/recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml b/recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml deleted file mode 100644 index 1e33be08..00000000 --- a/recipes/Qwen2.5-Coder-3B/grpo/config_codeforces_1k.yaml +++ /dev/null @@ -1,55 +0,0 @@ -# Model arguments -model_name_or_path: Qwen/Qwen2.5-Coder-3B-Instruct -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training arguments -dataset_name: open-r1/verifiable-coding-problems-python-1k -dataset_configs: -- all - -# GRPO trainer config -benchmarks: -- gpqa -bf16: true -callbacks: -- push_to_hub_revision -eval_strategy: 'no' -gradient_accumulation_steps: 1 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: open-r1/Qwen2.5-Coder-3B-GRPO -hub_strategy: every_save -learning_rate: 1.0e-06 -log_completions: true -log_level: info -logging_steps: 1 -logging_strategy: steps -lr_scheduler_type: constant -max_prompt_length: 1024 -max_completion_length: 2048 -max_steps: -1 -num_generations: 14 -num_train_epochs: 1 -output_dir: data/Qwen2.5-Coder-3B-GRPO -overwrite_output_dir: true -per_device_train_batch_size: 16 -push_to_hub: true -report_to: -- wandb -reward_funcs: -- format -- code -reward_weights: -- 0.25 -- 1.0 -save_strategy: "steps" -save_steps: 10 -save_total_limit: 1 -seed: 42 -use_vllm: true -vllm_device: auto -vllm_gpu_memory_utilization: 0.7 -warmup_ratio: 0.0 diff --git a/recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml b/recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml deleted file mode 100644 index 33e37136..00000000 --- a/recipes/Qwen2.5-Coder-3B/sft/config_openthoughts_code.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# Model arguments -model_name_or_path: Qwen/Qwen2.5-Coder-3B -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training arguments -dataset_name: open-r1/OpenThoughts-114k-code -dataset_configs: -- all -dataset_num_proc: 48 - -# SFT trainer config -callbacks: -- push_to_hub_revision -benchmarks: -- gpqa -bf16: true -do_eval: true -eval_strategy: epoch -gradient_accumulation_steps: 4 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: open-r1/Qwen2.5-Coder-3B-SFT-openthoughts-code-v00.00 -hub_strategy: every_save -learning_rate: 5.0e-07 -log_level: info -logging_steps: 1 -logging_strategy: steps -lr_scheduler_type: cosine_with_min_lr -lr_scheduler_kwargs: - min_lr_rate: 0.1 -packing: true -max_seq_length: 32768 -max_steps: -1 -num_train_epochs: 3 -output_dir: data/Qwen2.5-Coder-3B-SFT-openthoughts-code-v00.00 -overwrite_output_dir: true -per_device_eval_batch_size: 1 -per_device_train_batch_size: 4 -push_to_hub: true -report_to: -- wandb -save_strategy: "epoch" -save_total_limit: 1 -seed: 42 -use_liger: true -wandb_entity: huggingface -wandb_project: open-r1 -warmup_ratio: 0.03 \ No newline at end of file From 5f35a61b64f7de1ddfa5514d398aebfa548ac572 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 
2025 16:46:36 +0000 Subject: [PATCH 26/33] Fix vLLM --- slurm/train.slurm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/slurm/train.slurm b/slurm/train.slurm index c212afed..c10a2a23 100644 --- a/slurm/train.slurm +++ b/slurm/train.slurm @@ -32,11 +32,11 @@ WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) # Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') -# USE_VLLM=$(grep 'use_vllm:\s*true' $CONFIG_FILE) # Match "use_vllm: true" (with optional whitespace) +USE_VLLM=$(grep 'use_vllm:\s*true' $CONFIG_FILE) # Match "use_vllm: true" (with optional whitespace) -# if [ -n "$USE_VLLM" ]; then # Check if USE_VLLM is *not* empty (found) -# WORLD_SIZE=$(($WORLD_SIZE-1)) -# fi +if [ -n "$USE_VLLM" ]; then # Check if USE_VLLM is *not* empty (found) + WORLD_SIZE=$(($WORLD_SIZE-1)) +fi # Split the string into individual arguments IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS" From 93254b46c16d25944c4ae2bc8e9b52cd5cf5f52e Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 16:50:20 +0000 Subject: [PATCH 27/33] Fix --- setup.py | 3 +++ src/open_r1/rewards.py | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/setup.py b/setup.py index 231de49a..907269c2 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ "datasets>=3.2.0", "deepspeed==0.15.4", "distilabel[vllm,ray,openai]>=1.5.2", + "e2b-code-interpreter>=1.0.5", "einops>=0.8.0", "flake8>=6.0.0", "flash_attn>=2.7.4.post1", @@ -60,6 +61,7 @@ "parameterized>=0.9.0", "peft>=0.14.0", "pytest", + "python-dotenv", "ruff>=0.9.0", "safetensors>=0.3.3", "sentencepiece>=0.1.99", @@ -88,6 +90,7 @@ def deps_list(*pkgs): extras["torch"] = deps_list("torch") extras["quality"] = deps_list("ruff", "isort", "flake8") extras["train"] = deps_list("flash_attn") +extras["code"] = deps_list("e2b-code-interpreter", "python-dotenv") extras["eval"] = deps_list("lighteval", "math-verify") extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] + extras["train"] diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 7b99037f..d9beeb60 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -295,6 +295,12 @@ def code_reward(completions, **kwargs) -> list[float]: Assumes the dataset contains a `verification_info` column with test cases. """ + if not is_e2b_available(): + raise ImportError( + "E2B is not available and required for this reward function. Please install E2B with " + "`pip install e2b-code-interpreter` and add an API key to a `.env` file." 
+ ) + rewards = [] try: """Returns a reward function that evaluates code snippets in a sandbox.""" From 853e42b875de293fc52d08da3c9ef507923b32c6 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Tue, 18 Feb 2025 22:34:28 +0000 Subject: [PATCH 28/33] Add note --- src/open_r1/rewards.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index d9beeb60..4440335e 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -302,6 +302,7 @@ def code_reward(completions, **kwargs) -> list[float]: ) rewards = [] + # TODO: add support for other languages in E2B: https://e2b.dev/docs/code-interpreting/supported-languages try: """Returns a reward function that evaluates code snippets in a sandbox.""" evaluation_script_template = """ From 23dfafdb151b4a60d4c99c35ca8a10649ce2f44f Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 19 Feb 2025 09:33:34 +0000 Subject: [PATCH 29/33] Add doc --- README.md | 29 +++++++++++++++++++ .../grpo/config_demo_code.yaml | 6 ++-- src/open_r1/rewards.py | 2 +- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a14f180f..40fd8c96 100644 --- a/README.md +++ b/README.md @@ -170,6 +170,35 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con Our final [model](https://huggingface.co/Dongwei/Qwen-2.5-7B_Base_Math_smalllr), while using different learning rates, loss functions and reward structures, achieves 69.4% accuracy on MATH-500, demonstrating a 17%+ improvement over the base model. +#### Training with a code interpreter + +We provide a `code` reward function for executing code generated by the policy during training. Currently, this reward function targets code contests like [Codeforces](https://codeforces.com), where solutions are executed against a set of test cases and the overall success rate is returned as the final reward. To ensure safe execution, we use [E2B](https://e2b.dev) sandboxes, which are fast and cheap to run. To use this reward function, first install the necessary dependencies: + +```shell +uv pip install -e '.[code] +``` + +Then create a `.env` file and place an API token from E2B within it: + +``` +E2B_API_KEY="e2b_xxx" +``` + +Then make sure your dataset contains a `verification_info` column with the following schema (adopted from PrimeIntellect's excellent [datasets](https://huggingface.co/collections/PrimeIntellect/synthetic-1-67a2c399cfdd6c9f7fae0c37) of verifiable problems): + +```python +{ + "language": "python", + "test_cases": [ + { + "input": "4\n4\n0001\n1000\n0011\n0111\n3\n010\n101\n0\n2\n00000\n00001\n4\n01\n001\n0001\n00001\n", + "output": "1\n3 \n-1\n0\n\n2\n1 2 \n", + "type": "stdin_stdout", + } + ], +} +``` + ### Launching jobs on a Slurm cluster If you have access to a Slurm cluster, we provide a `slurm/train.slurm` script that will automatically queue training jobs for you. 
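As a concrete illustration of the reward semantics described in the README section above, here is a rough, sandbox-free sketch of the scoring the `code` reward applies: take the last fenced `python` block from a completion, run it against each stdin/stdout test case, and return the fraction of cases that pass. Everything below is illustrative only — the helper names are made up, and during training the equivalent check runs inside an E2B sandbox (using the language declared in `verification_info`) rather than a local subprocess.

```python
# Illustrative sketch only: a local approximation of the test-case pass rate
# that the `code` reward returns. In training, execution happens inside an
# E2B sandbox, not via a local subprocess.
import re
import subprocess

FENCE = "`" * 3  # three backticks: the marker around fenced code blocks
PYTHON_BLOCK = re.compile(FENCE + r"python\n(.*?)" + FENCE, re.DOTALL)


def last_python_block(completion: str) -> str:
    """Return the last fenced Python block in a completion, or "" if none."""
    matches = PYTHON_BLOCK.findall(completion)
    return matches[-1] if matches else ""


def pass_rate(code: str, test_cases: list[dict]) -> float:
    """Fraction of stdin/stdout test cases that the candidate code solves."""
    if not code or not test_cases:
        return 0.0
    passed = 0
    for case in test_cases:
        try:
            result = subprocess.run(
                ["python3", "-c", code],
                input=case["input"],
                text=True,
                capture_output=True,
                timeout=10,
            )
        except subprocess.TimeoutExpired:
            continue
        if result.returncode == 0 and result.stdout.strip() == case["output"].strip():
            passed += 1
    return passed / len(test_cases)


if __name__ == "__main__":
    completion = (
        "Here is my solution:\n"
        + FENCE + "python\nn = int(input())\nprint(n * 2)\n" + FENCE
    )
    cases = [{"input": "3\n", "output": "6", "type": "stdin_stdout"}]
    print(pass_rate(last_python_block(completion), cases))  # expected: 1.0
```

In this sketch a completion with no extractable code block simply scores 0.0; otherwise the reward is the success rate over the provided test cases, matching the description in the README section above.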
Here's how you can use it: diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 46ee165a..ed3e9daf 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -23,7 +23,7 @@ gradient_accumulation_steps: 4 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -hub_model_id: open-r1/Qwen2.5-1.5B-Open-R1-Code-GRPO +hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO hub_strategy: every_save learning_rate: 5.0e-07 log_completions: true @@ -56,6 +56,4 @@ save_steps: 50 save_total_limit: 1 seed: 42 temperature: 1.0 -warmup_ratio: 0.03 -wandb_entity: huggingface -wandb_project: open-r1 \ No newline at end of file +warmup_ratio: 0.03 \ No newline at end of file diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 4440335e..c003f4d3 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -348,7 +348,7 @@ def evaluate_code(code, test_cases): ] with Sandbox(timeout=30, request_timeout=3) as sbx: for script in scripts: - execution = sbx.run_code(script) + execution = sbx.run_code(script, language=verification_info["language"]) try: output = float(execution.text) except (TypeError, ValueError): From 65c44d89700029049a03602b244939cc8b5dd7b8 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 19 Feb 2025 09:50:51 +0000 Subject: [PATCH 30/33] doc --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 40fd8c96..5f82d72e 100644 --- a/README.md +++ b/README.md @@ -170,12 +170,12 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con Our final [model](https://huggingface.co/Dongwei/Qwen-2.5-7B_Base_Math_smalllr), while using different learning rates, loss functions and reward structures, achieves 69.4% accuracy on MATH-500, demonstrating a 17%+ improvement over the base model. -#### Training with a code interpreter +#### 👨‍💻 Training with a code interpreter We provide a `code` reward function for executing code generated by the policy during training. Currently, this reward function targets code contests like [Codeforces](https://codeforces.com), where solutions are executed against a set of test cases and the overall success rate is returned as the final reward. To ensure safe execution, we use [E2B](https://e2b.dev) sandboxes, which are fast and cheap to run. To use this reward function, first install the necessary dependencies: ```shell -uv pip install -e '.[code] +uv pip install -e '.[code]' ``` Then create a `.env` file and place an API token from E2B within it: From 04381cab1c2862c0ca45998c3ff8a732b4fa2a79 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 19 Feb 2025 09:51:30 +0000 Subject: [PATCH 31/33] Fix --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index ed3e9daf..b82cd8f2 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -11,8 +11,6 @@ dataset_configs: system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" # GRPO trainer config -callbacks: -- push_to_hub_revision beta: 0.01 bf16: true use_vllm: true From fb6e4ae8b37106a097b6e51acd42bf6428499a15 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 19 Feb 2025 09:52:58 +0000 Subject: [PATCH 32/33] Tune lr --- README.md | 2 ++ recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5f82d72e..6201c22d 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,8 @@ Then make sure your dataset contains a `verification_info` column with the follo } ``` +See the [training config](./recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml) for an example with `Qwen2.5-1.5B-Instruct`. + ### Launching jobs on a Slurm cluster If you have access to a Slurm cluster, we provide a `slurm/train.slurm` script that will automatically queue training jobs for you. Here's how you can use it: diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index b82cd8f2..783a4d2a 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -23,7 +23,7 @@ gradient_checkpointing_kwargs: use_reentrant: false hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO hub_strategy: every_save -learning_rate: 5.0e-07 +learning_rate: 5.0e-06 log_completions: true log_level: info logging_first_step: true From 89ded4353c51a15ad1b1b65a373fd42d2874f1ac Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 19 Feb 2025 10:03:01 +0000 Subject: [PATCH 33/33] Add command --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6201c22d..1111231a 100644 --- a/README.md +++ b/README.md @@ -199,7 +199,13 @@ Then make sure your dataset contains a `verification_info` column with the follo } ``` -See the [training config](./recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml) for an example with `Qwen2.5-1.5B-Instruct`. +For example, to train a smol model on Python problems, run: + +```shell +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ + --num_processes=7 src/open_r1/grpo.py \ + --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +``` ### Launching jobs on a Slurm cluster
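To round out the command above, here is a minimal sketch of how a compatible dataset row could be assembled before pointing `dataset_name` at it in the GRPO config. The `problem` column name, the prompt text, and the `my-org/verifiable-problems-demo` repo id are hypothetical placeholders; only the `verification_info` layout (`language` plus stdin/stdout `test_cases`) follows the schema documented in the README changes above.

```python
# Hypothetical sketch: build a tiny dataset carrying the `verification_info`
# schema expected by the `code` reward. The column name, prompt text, and Hub
# repo id are placeholders, not part of the actual recipes.
from datasets import Dataset

rows = [
    {
        # Hypothetical problem statement shown to the policy.
        "problem": "Read an integer n from standard input and print n * 2.",
        "verification_info": {
            "language": "python",
            "test_cases": [
                {"input": "3\n", "output": "6\n", "type": "stdin_stdout"},
                {"input": "10\n", "output": "20\n", "type": "stdin_stdout"},
            ],
        },
    }
]

dataset = Dataset.from_list(rows)
print(dataset[0]["verification_info"]["test_cases"][0])

# Once uploaded, reference the repo id via `dataset_name` in the YAML config:
# dataset.push_to_hub("my-org/verifiable-problems-demo")
```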