diff --git a/examples/grpo_math/README.md b/examples/grpo_math/README.md index 5b3c2c3ea2..fd6d2679e2 100644 --- a/examples/grpo_math/README.md +++ b/examples/grpo_math/README.md @@ -1,6 +1,13 @@ -# Example: PPO on MATH dataset +# Example: GRPO on MATH dataset + +This example shows the usage of [RM-Gallery](https://github.com/modelscope/RM-Gallery/) by running GRPO on a MATH dataset. You need to install RM-Gallery first. +The dataset is organized as: + +```jsonl +{"question": "what is 2+2?", "gt_answer": 4} +{"question": "what is 2+3?", "gt_answer": 5} +``` -This example shows the usage of PPO on the MATH dataset, adapted from [simpleRL](https://github.com/hkust-nlp/simpleRL-reason/tree/v0). For more detailed information, please refer to the [documentation](../../docs/sphinx_doc/source/tutorial/example_reasoning_basic.md). diff --git a/trinity/common/rewards/reward_fn.py b/trinity/common/rewards/reward_fn.py index c6fe5bb58a..f2c58b70a5 100644 --- a/trinity/common/rewards/reward_fn.py +++ b/trinity/common/rewards/reward_fn.py @@ -69,7 +69,7 @@ def _build_sample_from_experience( ] sample = DataSample( - unique_id=experience.unique_id, + unique_id=experience.eid.uid, input=to_rm_gallery_messages(messages), output=output, metadata=experience.info,