Commit e622196

[Doc] Drop dummy reward and dataset for DeepMath-103K and accuracy reward (#4524)
1 parent 1b1242c commit e622196

File tree: 8 files changed (+151, −143 lines)


README.md

Lines changed: 3 additions & 6 deletions
@@ -92,16 +92,13 @@ trainer.train()
 ```python
 from datasets import load_dataset
 from trl import GRPOTrainer
+from trl.rewards import accuracy_reward

-dataset = load_dataset("trl-lib/tldr", split="train")
-
-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

 trainer = GRPOTrainer(
     model="Qwen/Qwen2-0.5B-Instruct",
-    reward_funcs=reward_num_unique_chars,
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
 trainer.train()
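
For context on the replacement: `accuracy_reward` scores a completion by whether its final answer matches the dataset's reference solution, rather than rewarding a surface statistic like the number of unique characters. The sketch below shows the general shape of such an accuracy-style reward; it is illustrative only, not TRL's implementation, and it assumes conversational completions, a `solution` column forwarded to the reward function as a keyword argument, and answers written as `\boxed{...}` (TRL's own `accuracy_reward` may instead rely on a dedicated math verifier).

```python
# Illustrative sketch of an accuracy-style reward -- not TRL's implementation.
# Assumptions: conversational completions (lists of message dicts), a `solution`
# keyword argument forwarded from the dataset column of the same name, and final
# answers written as \boxed{...} in the completion text.
import re


def sketch_accuracy_reward(completions, solution, **kwargs):
    """Return 1.0 when the boxed answer matches the reference solution, else 0.0."""
    contents = [completion[0]["content"] for completion in completions]
    rewards = []
    for content, reference in zip(contents, solution):
        # Extract the last-resort simple \boxed{...} answer (no nested braces handled).
        match = re.search(r"\\boxed\{(.+?)\}", content)
        answer = match.group(1).strip() if match else None
        rewards.append(1.0 if answer is not None and answer == str(reference).strip() else 0.0)
    return rewards
```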

docs/source/grpo_trainer.md

Lines changed: 14 additions & 23 deletions
@@ -14,10 +14,10 @@ This post-training method was contributed by [Quentin Gallouédec](https://huggi

 ## Quick start

-This example demonstrates how to train a model using the GRPO method. We train a [Qwen 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [UltraFeedback prompts dataset](https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt). You can view the data in the dataset here:
+This example demonstrates how to train a model using the GRPO method. We train a [Qwen 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [DeepMath-103K dataset](https://huggingface.co/datasets/trl-lib/DeepMath-103K). You can view the data in the dataset here:

 <iframe
-  src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
+  src="https://huggingface.co/datasets/trl-lib/DeepMath-103K/embed/viewer/default/train?row=0"
   frameborder="0"
   width="100%"
   height="560px"
@@ -28,21 +28,14 @@ Below is the script to train the model.
 ```python
 # train_grpo.py
 from datasets import load_dataset
-from trl import GRPOConfig, GRPOTrainer
-
-dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")
+from trl import GRPOTrainer
+from trl.rewards import accuracy_reward

-# Dummy reward function for demonstration purposes
-def reward_num_unique_letters(completions, **kwargs):
-    """Reward function that rewards completions with more unique letters."""
-    completion_contents = [completion[0]["content"] for completion in completions]
-    return [float(len(set(content))) for content in completion_contents]
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

-training_args = GRPOConfig(output_dir="Qwen2-0.5B-GRPO")
 trainer = GRPOTrainer(
     model="Qwen/Qwen2-0.5B-Instruct",
-    reward_funcs=reward_num_unique_letters,
-    args=training_args,
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
 trainer.train()
@@ -290,29 +283,27 @@ import argparse

 from datasets import load_dataset
 from trl import GRPOTrainer, GRPOConfig
+from trl.rewards import accuracy_reward

 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--vllm_server_host", type=str, default="", help="The server IP")
     args = parser.parse_args()

-    # Example dataset from TLDR
-    dataset = load_dataset("trl-lib/tldr", split="train")
-
-    # Dummy reward function: count the number of unique characters in the completions
-    def reward_num_unique_chars(completions, **kwargs):
-        return [len(set(c)) for c in completions]
+    dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

     training_args = GRPOConfig(
-        output_dir="Qwen2.5-72B-GRPO",
         per_device_train_batch_size=4,
-        bf16=True,
-        gradient_checkpointing=True,
         use_vllm=True,
         vllm_server_host=args.vllm_server_host.replace("ip-", "").replace("-", "."), # from ip-X-X-X-X to X.X.X.X
     )

-    trainer = GRPOTrainer(model="Qwen/Qwen2.5-72B", args=training_args, reward_funcs=reward_num_unique_chars, train_dataset=dataset)
+    trainer = GRPOTrainer(
+        model="Qwen/Qwen2.5-72B",
+        args=training_args,
+        reward_funcs=accuracy_reward,
+        train_dataset=dataset
+    )
     trainer.train()

 if __name__=="__main__":

docs/source/quickstart.md

Lines changed: 3 additions & 6 deletions
@@ -24,15 +24,12 @@ trainer.train()
 ```python
 from trl import GRPOTrainer
 from datasets import load_dataset
-
-# Define a simple reward function (count unique chars as example)
-def reward_function(completions, **kwargs):
-    return [len(set(completion.lower())) for completion in completions]
+from trl.rewards import accuracy_reward

 trainer = GRPOTrainer(
     model="Qwen/Qwen2.5-0.5B-Instruct",  # Start from SFT model
-    train_dataset=load_dataset("trl-lib/tldr", split="train"),
-    reward_funcs=reward_function,
+    train_dataset=load_dataset("trl-lib/DeepMath-103K", split="train"),
+    reward_funcs=accuracy_reward,
 )
 trainer.train()
 ```

docs/source/rloo_trainer.md

Lines changed: 6 additions & 13 deletions
@@ -15,10 +15,10 @@ This post-training method was contributed by [Costa Huang](https://github.com/vw

 ## Quick start

-This example demonstrates how to train a model using the RLOO method. We train a [Qwen 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [UltraFeedback prompts dataset](https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt). You can view the data in the dataset here:
+This example demonstrates how to train a model using the RLOO method. We train a [Qwen 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [DeepMath-103K dataset](https://huggingface.co/datasets/trl-lib/DeepMath-103K). You can view the data in the dataset here:

 <iframe
-  src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
+  src="https://huggingface.co/datasets/trl-lib/DeepMath-103K/embed/viewer/default/train?row=0"
   frameborder="0"
   width="100%"
   height="560px"
@@ -29,21 +29,14 @@ Below is the script to train the model.
 ```python
 # train_rloo.py
 from datasets import load_dataset
-from trl import RLOOConfig, RLOOTrainer
-
-dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")
+from trl import RLOOTrainer
+from trl.rewards import accuracy_reward

-# Dummy reward function for demonstration purposes
-def reward_num_unique_letters(completions, **kwargs):
-    """Reward function that rewards completions with more unique letters."""
-    completion_contents = [completion[0]["content"] for completion in completions]
-    return [float(len(set(content))) for content in completion_contents]
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

-training_args = RLOOConfig(output_dir="Qwen2-0.5B-RLOO")
 trainer = RLOOTrainer(
     model="Qwen/Qwen2-0.5B-Instruct",
-    reward_funcs=reward_num_unique_letters,
-    args=training_args,
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
 trainer.train()

docs/source/vllm_integration.md

Lines changed: 21 additions & 71 deletions
@@ -46,24 +46,14 @@ Sample of a simple `train.py` script:
 ```python
 from datasets import load_dataset
 from trl import GRPOTrainer, GRPOConfig
+from trl.rewards import accuracy_reward

-dataset = load_dataset("trl-lib/tldr", split="train")
-
-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
-
-training_args = GRPOConfig(
-    output_dir="my_test",
-    use_vllm=True,
-    bf16=True,
-    gradient_checkpointing=True,
-)
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

 trainer = GRPOTrainer(
     model="Qwen/Qwen2.5-7B",
-    args=training_args,
-    reward_funcs=reward_num_unique_chars,
+    args=GRPOConfig(use_vllm=True),
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )

@@ -76,24 +66,14 @@ trainer.train()
 ```python
 from datasets import load_dataset
 from trl import OnlineDPOTrainer, OnlineDPOConfig
+from trl.rewards import accuracy_reward

-dataset = load_dataset("trl-lib/tldr", split="train")
-
-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
-
-training_args = OnlineDPOConfig(
-    output_dir="my_test",
-    use_vllm=True,
-    bf16=True,
-    gradient_checkpointing=True,
-)
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

 trainer = OnlineDPOTrainer(
     model="Qwen/Qwen2.5-7B",
-    args=training_args,
-    reward_funcs=reward_num_unique_chars,
+    args=OnlineDPOConfig(use_vllm=True),
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )

@@ -106,24 +86,14 @@ trainer.train()
 ```python
 from datasets import load_dataset
 from trl.experimental.nash_md import NashMDConfig, NashMDTrainer
+from trl.rewards import accuracy_reward

-dataset = load_dataset("trl-lib/tldr", split="train")
-
-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
-
-training_args = NashMDConfig(
-    output_dir="my_test",
-    use_vllm=True,
-    bf16=True,
-    gradient_checkpointing=True,
-)
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

 trainer = NashMDTrainer(
     model="Qwen/Qwen2.5-7B",
-    args=training_args,
-    reward_funcs=reward_num_unique_chars,
+    args=NashMDConfig(use_vllm=True),
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )

@@ -135,25 +105,15 @@ trainer.train()

 ```python
 from datasets import load_dataset
-from trl.experimental.xpo import XPOTrainer, XPOConfig
-
-dataset = load_dataset("trl-lib/tldr", split="train")
+from trl import XPOTrainer, XPOConfig
+from trl.rewards import accuracy_reward

-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
-
-training_args = XPOConfig(
-    output_dir="my_test",
-    use_vllm=True,
-    bf16=True,
-    gradient_checkpointing=True,
-)
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

 trainer = XPOTrainer(
     model="Qwen/Qwen2.5-7B",
-    args=training_args,
-    reward_funcs=reward_num_unique_chars,
+    args=XPOConfig(use_vllm=True),
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )

@@ -166,24 +126,14 @@ trainer.train()
 ```python
 from datasets import load_dataset
 from trl import RLOOTrainer, RLOOConfig
+from trl.rewards import accuracy_reward

-dataset = load_dataset("trl-lib/tldr", split="train")
-
-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
-
-training_args = RLOOConfig(
-    output_dir="my_test",
-    use_vllm=True,
-    bf16=True,
-    gradient_checkpointing=True,
-)
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")

 trainer = RLOOTrainer(
     model="Qwen/Qwen2.5-7B",
-    args=training_args,
-    reward_funcs=reward_num_unique_chars,
+    args=RLOOConfig(use_vllm=True),
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
