diff --git a/examples/rec_gsm8k/README.md b/examples/rec_gsm8k/README.md index c0d9376db1..b3a5673463 100644 --- a/examples/rec_gsm8k/README.md +++ b/examples/rec_gsm8k/README.md @@ -1,41 +1,40 @@ -# Example: REC on GSM8k dataset +# Example: group-relative REINFORCE variants on GSM8k dataset -This example shows the usage of REC on the [GSM8k dataset](https://huggingface.co/datasets/openai/gsm8k). +This example shows the usage of group-relative REINFORCE variants on the [GSM8k dataset](https://huggingface.co/datasets/openai/gsm8k). -For more detailed information, please refer to the [documentation](../../docs/sphinx_doc/source/tutorial/example_reasoning_basic.md). +For more details about algorithm design, please refer to [our paper](https://arxiv.org/abs/2509.24203). The config file is located in [`gsm8k.yaml`](gsm8k.yaml). -# Group-relative REINFORCE Families -This folder provides **example configurations** for running different group-relative REINFORCE families within Trinity-RFT. +## Group-relative REINFORCE variants +This folder provides example configurations for running different group-relative REINFORCE variants within Trinity-RFT. It includes three major families: -- **REC family** (clipping + importance sampling) -- **REP family** (regularization-based variants) -- **RED family** (data-distribution shaping strategies) +- **REC family** (regularization by clipping) +- **REP family** (regularization by an additive loss term) +- **RED family** (actively shaping data distribution) -We also provide baseline implementations such as **Vanilla REINFORCE** and **GRPO**. +These include baseline algorithms like vanilla REINFORCE and GRPO as special cases. All algorithms are instantiated through modular YAML configs for easy reproduction and extension. 
-# Summary Table 📝 +## Summary Table 📝 | Family | Variants | Key Idea | | ------------- | ----------------------------------------------- | ----------------------------------- | -| **Baselines** | REINFORCE, GRPO | Standard references | -| **REC** | OneSide-NoIS, OneSide-IS, TwoSide-IS, Ring-NoIS | Clipping + importance sampling | -| **REP** | AsymRE, OPMD | Regularization | -| **RED** | Drop, Weight | Data-distribution shaping | +| **Baselines** | REINFORCE, GRPO | Standard references | +| **REC** | OneSide/TwoSide/Ring-IS/NoIS | Clipping as regularization, with or without importance sampling | +| **REP** | AsymRE, OPMD | Regularization by an additive loss term | +| **RED** | Drop, Weight | Actively shaping data distribution | -# Instantiations +## Instantiations -## Baselines +### Baselines -### REINFORCE -Vanilla REINFORCE with group mean as baseline. +**Vanilla REINFORCE** with group mean as baseline: ``` algorithm: @@ -52,8 +51,7 @@ algorithm: std_normalize: false ``` -### GRPO -GRPO implemented with zero KL regularizer. Regularization can be enabled via `kl_loss_fn` and `kl_loss_fn_args`. +**GRPO** with zero KL regularization by default (KL regularization can be enabled via `kl_loss_fn` and `kl_loss_fn_args`): ``` algorithm: @@ -71,17 +69,11 @@ algorithm: kl_loss_fn: 'k2' kl_loss_fn_args: kl_coef: 0.0 - ``` -## REC family -Variants of clipping and importance-sampling strategies. -- REC-OneSide-NoIS -- REC-OneSide-IS -- REC-TwoSide-IS -- REC-Ring-NoIS +### REC family -### REC-OneSide-NoIS +**REC-OneSide-NoIS:** ``` algorithm: @@ -98,7 +90,7 @@ algorithm: std_normalize: false ``` -### REC-OneSide-IS +**REC-OneSide-IS:** ``` algorithm: @@ -115,7 +107,7 @@ algorithm: std_normalize: false ``` -### REC-TwoSide-IS +**REC-TwoSide-IS:** ``` algorithm: @@ -131,7 +123,8 @@ algorithm: advantage_fn_args: std_normalize: false ``` -### REC-Ring-NoIS + +**REC-Ring-NoIS:** ``` algorithm: @@ -150,13 +143,10 @@ algorithm: std_normalize: false ``` -## REP family +### REP family -Regularization-based algorithms. 
-- AsymRE (forward KL regularization) -- Kimi’s OPMD (k2 regularizer) -### AsymRE +**Meta's AsymRE:** ``` algorithm: @@ -172,7 +162,7 @@ algorithm: ``` -### Kimi's OPMD +**Kimi's OPMD:** ``` algorithm: @@ -186,12 +176,10 @@ algorithm: std_normalize: false ``` -## RED family -Data-distribution shaping variants. -- RED-Drop (drop extra negative examples to balance the positive examples v.s. negative examples) -- RED-Weight (advantage-weighting strategy) +### RED family + -### RED-Drop +**RED-Drop:** ``` algorithm: @@ -206,7 +194,7 @@ algorithm: ``` -### RED-Weight +**RED-Weight:** ``` algorithm: @@ -219,3 +207,17 @@ algorithm: advantage_fn_args: std_normalize: false ``` + +## Citation + +```bibtex +@misc{yao2025grouprelativereinforcesecretlyoffpolicy, + title={Group-Relative REINFORCE Is Secretly an Off-Policy Algorithm: Demystifying Some Myths About GRPO and Its Friends}, + author={Chaorui Yao and Yanxi Chen and Yuchang Sun and Yushuo Chen and Wenhao Zhang and Xuchen Pan and Yaliang Li and Bolin Ding}, + year={2025}, + eprint={2509.24203}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/2509.24203}, +} +``` diff --git a/examples/rec_gsm8k/gsm8k.yaml b/examples/rec_gsm8k/gsm8k.yaml index 3be850e401..570136929b 100644 --- a/examples/rec_gsm8k/gsm8k.yaml +++ b/examples/rec_gsm8k/gsm8k.yaml @@ -1,4 +1,3 @@ -# Configuration file for the REC GSM8k project. 
project: "Trinity-RFT-GSM8K" name: rec_gsm8k checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints} @@ -15,7 +14,7 @@ algorithm: policy_loss_fn_args: epsilon_low: 0.2 epsilon_high: 0.2 - clip_mode: "none" + clip_mode: "one-side" weight: "none" temp: 1.0 regularizer: "none" diff --git a/examples/rec_math/README.md b/examples/rec_math/README.md deleted file mode 100644 index 8cc79050b8..0000000000 --- a/examples/rec_math/README.md +++ /dev/null @@ -1,221 +0,0 @@ -# Example: REC on MATH dataset - -This example shows the usage of REC on the [MATH dataset](https://huggingface.co/datasets/nlile/hendrycks-MATH-benchmark). - -For more detailed information, please refer to the [documentation](../../docs/sphinx_doc/source/tutorial/example_reasoning_basic.md). - -The config file is located in [`math.yaml`](math.yaml). - -# Group-relative REINFORCE Families -This folder provides **example configurations** for running different group-relative REINFORCE families within Trinity-RFT. - -It includes three major families: - -- **REC family** (clipping + importance sampling) -- **REP family** (regularization-based variants) -- **RED family** (data-distribution shaping strategies) - -We also provide baseline implementations such as **Vanilla REINFORCE** and **GRPO**. - -All algorithms are instantiated through modular YAML configs for easy reproduction and extension. - -# Summary Table 📝 - -| Family | Variants | Key Idea | -| ------------- | ----------------------------------------------- | ----------------------------------- | -| **Baselines** | REINFORCE, GRPO | Standard references | -| **REC** | OneSide-NoIS, OneSide-IS, TwoSide-IS, Ring-NoIS | Clipping + importance sampling | -| **REP** | AsymRE, OPMD | Regularization | -| **RED** | Drop, Weight | Data-distribution shaping | - - - -# Instantiations - -## Baselines - -### REINFORCE -Vanilla REINFORCE with group mean as baseline. 
- -``` -algorithm: - algorithm_type: rec - policy_loss_fn_args: - epsilon_low: 0.2 - epsilon_high: 0.2 - clip_mode: "none" # no clipping - weight: "none" # uniform weighting for samples - temp: 1.0 - regularizer: "none" # no regularizer - regularizer_coef: 0.0 - advantage_fn_args: - std_normalize: false -``` - -### GRPO -GRPO implemented with zero KL regularizer. Regularization can be enabled via `kl_loss_fn` and `kl_loss_fn_args`. - -``` -algorithm: - algorithm_type: rec - policy_loss_fn_args: - epsilon_low: 0.2 - epsilon_high: 0.2 - clip_mode: "one-side" - weight: "importance_sampling" - temp: 1.0 - regularizer: "none" - regularizer_coef: 0.0 - advantage_fn_args: - std_normalize: true - kl_loss_fn: 'k2' - kl_loss_fn_args: - kl_coef: 0.0 - -``` - -## REC family -Variants of clipping and importance-sampling strategies. -- REC-OneSide-NoIS -- REC-OneSide-IS -- REC-TwoSide-IS -- REC-Ring-NoIS - -### REC-OneSide-NoIS - -``` -algorithm: - algorithm_type: rec - policy_loss_fn_args: - epsilon_low: 0.2 - epsilon_high: 0.2 - clip_mode: "one-side" - weight: "none" - temp: 1.0 - regularizer: "none" - regularizer_coef: 0.0 - advantage_fn_args: - std_normalize: false -``` - -### REC-OneSide-IS - -``` -algorithm: - algorithm_type: rec - policy_loss_fn_args: - epsilon_low: 0.2 - epsilon_high: 0.2 - clip_mode: "one-side" - weight: "importance_sampling" - temp: 1.0 - regularizer: "none" - regularizer_coef: 0.0 - advantage_fn_args: - std_normalize: false -``` - -### REC-TwoSide-IS - -``` -algorithm: - algorithm_type: rec - policy_loss_fn_args: - epsilon_low: 0.2 - epsilon_high: 0.2 - clip_mode: "two-side" - weight: "importance_sampling" - temp: 1.0 - regularizer: "none" - regularizer_coef: 0.0 - advantage_fn_args: - std_normalize: false -``` -### REC-Ring-NoIS - -``` -algorithm: - algorithm_type: rec - policy_loss_fn_args: - epsilon_low: 0.2 - epsilon_high: 0.2 - epsilon_low_prime: 0.6 - epsilon_high_prime: 2.0 - clip_mode: "ring" - weight: "none" - temp: 1.0 - regularizer: "none" 
- regularizer_coef: 0.0 - advantage_fn_args: - std_normalize: false -``` - -## REP family - -Regularization-based algorithms. -- AsymRE (forward KL regularization) -- Kimi’s OPMD (k2 regularizer) - -### AsymRE - -``` -algorithm: - algorithm_type: rec - policy_loss_fn_args: - clip_mode: "none" - weight: "none" - temp: 1.0 - regularizer: "forward-kl" - regularizer_coef: 0.1 - advantage_fn_args: - std_normalize: false -``` - - -### Kimi's OPMD - -``` -algorithm: - algorithm_type: rec - policy_loss_fn_args: - clip_mode: "none" - weight: "none" - regularizer: "k2" - regularizer_coef: 0.1 - advantage_fn_args: - std_normalize: false -``` - -## RED family -Data-distribution shaping variants. -- RED-Drop (drop extra negative examples to balance the positive examples v.s. negative examples) -- RED-Weight (advantage-weighting strategy) - -### RED-Drop - -``` -algorithm: - algorithm_type: rec - policy_loss_fn_args: - clip_mode: "none" - weight: "none" - regularizer: "none" - advantage_fn_args: - std_normalize: false - drop: "balance" -``` - - -### RED-Weight - -``` -algorithm: - algorithm_type: rec - policy_loss_fn_args: - clip_mode: "none" - weight: "advantage" - regularizer: "none" - temp: 1.0 - advantage_fn_args: - std_normalize: false -``` diff --git a/examples/rec_math/math.yaml b/examples/rec_math/math.yaml deleted file mode 100644 index 226fbe7d75..0000000000 --- a/examples/rec_math/math.yaml +++ /dev/null @@ -1,82 +0,0 @@ -project: Trinity-RFT-rec_math -name: rec_math -checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints} -mode: both -model: - model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-3B-Instruct} - max_response_tokens: 2048 - max_model_len: 2048 -algorithm: - algorithm_type: rec - repeat_times: 8 - optimizer: - lr: 6e-8 - policy_loss_fn_args: - epsilon_low: 0.2 - epsilon_high: 0.2 - epsilon_high_prime: 0.4 - epsilon_low_prime: 0.4 - clip_mode: none - weight: none - advantage_fn_args: - std_normalize: false -cluster: - node_num: 1 - 
gpu_per_node: 8 -buffer: - total_steps: 200 - batch_size: 16 - explorer_input: - taskset: - name: math - storage_type: file - path: ${oc.env:TRINITY_TASKSET_PATH} - format: - prompt_key: problem - response_key: solution - rollout_args: - temperature: 1.0 - top_p: 1.0 - logprobs: 0 - eval_tasksets: - - name: math - storage_type: file - path: ${oc.env:TRINITY_EVAL_TASKSET_PATH} - split: test - format: - prompt_key: problem - response_key: solution - rollout_args: - temperature: 0.1 - top_p: 0.95 - default_workflow_type: math_boxed_workflow - default_reward_fn_type: math_boxed_reward - trainer_input: - experience_buffer: - name: math_buffer - storage_type: queue -explorer: - eval_interval: 500 - runner_per_model: 16 - rollout_model: - engine_type: vllm_async - engine_num: 4 - tensor_parallel_size: 1 - enable_prefix_caching: false - enforce_eager: true - dtype: bfloat16 - max_prompt_tokens: 1024 - max_response_tokens: 2048 - seed: 42 -synchronizer: - sync_method: nccl - sync_interval: 1 - sync_timeout: 3600 - sync_offset: 0 -trainer: - trainer_type: verl - save_interval: 100 - grad_clip: 1.0 - use_dynamic_bsz: true - max_token_len_per_gpu: 16384 - ulysses_sequence_parallel_size: 1