-
Notifications
You must be signed in to change notification settings - Fork 40
/
Copy pathcountdown_task.py
166 lines (137 loc) · 5.1 KB
/
countdown_task.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
import pandas as pd
from torch.utils.data import Dataset
from data_types import MiniBatch
from tokenizer import Tokenizer
# Content of the system turn: reason first, then answer.
SYSTEM_MESSAGE = (
    "You are a helpful assistant. "
    "You first think about the reasoning process in your mind "
    "and then provide the user with the answer."
)

# Content of the user turn; `{numbers}` and `{target}` are filled in with
# `str.format`.
USER_TEMPLATE = (
    "Using the numbers {numbers}, create an equation that equals {target}. "
    "You can use basic arithmetic operations (+, -, *, /) "
    "and each number can only be used once. "
    "Show your work in <think> </think> tags. "
    "And return the final answer in <answer> </answer> tags, "
    "for example <answer> (1 + 2) / 3 </answer>."
)

# Text that opens the response; note that it ends with an opened <think> tag.
RESPONSE_PROMPT = "Let me solve this step by step.\n<think>"
class CountdownTasksDataset(Dataset):
    """Countdown tasks paired with the tokenized chat prefix fed to the model.

    The parquet data read from ``<data_path>/data`` is split by row position:
    the last ``test_size`` rows form the test split and every row before them
    forms the train split.
    """

    def __init__(
        self,
        tokenizer: Tokenizer,
        data_path: str,
        split: str = "train",
        test_size: int = 100,
    ):
        """Load the data and keep only the rows of the requested split.

        Args:
            tokenizer: Tokenizer used by `encode_prefix`.
            data_path: Directory that contains the parquet ``data`` entry.
            split: ``"train"`` selects the training rows; any other value
                selects the test rows.
            test_size: Number of trailing rows reserved for testing. Values
                outside ``[0, len(data)]`` are clamped to that range.
        """
        data = pd.read_parquet(Path(data_path) / "data")
        # Split on an explicit row index instead of negative slicing:
        # `iloc[:-test_size]` / `iloc[-test_size:]` are wrong for
        # `test_size == 0`, because `-0 == 0` makes the train split empty and
        # the test split the whole table.
        held_out = min(max(test_size, 0), len(data))
        split_at = len(data) - held_out
        if split == "train":
            self.data = data.iloc[:split_at]
        else:
            self.data = data.iloc[split_at:]
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return row `idx` as a dict, extended with its encoded prefix.

        The row must provide the ``"nums"`` and ``"target"`` columns; the
        keys produced by `encode_prefix` are merged into the same dict.
        """
        item = self.data.iloc[idx].to_dict()
        item.update(self.encode_prefix(item["nums"], item["target"]))
        return item

    def encode_prefix(self, numbers: List[int], target: int):
        """Build and tokenize the prefix, the *actual* input to the model.

        Args:
            numbers: Numbers substituted into the user message.
            target: Target value substituted into the user message.

        Returns:
            A dict with ``"prefix"`` (the rendered chat text),
            ``"prefix_tokens"`` and ``"prefix_token_ids"`` (the ``tokens`` and
            ``ids`` attributes of the tokenizer output for that text).
        """
        user_message = USER_TEMPLATE.format(numbers=numbers, target=target)
        prefix = self.tokenizer.encode_chat_with_response_prompt(
            [
                {"role": "system", "content": SYSTEM_MESSAGE},
                {"role": "user", "content": user_message},
            ],
            RESPONSE_PROMPT,
        )
        tokens = self.tokenizer.tokenize(prefix)
        return {
            "prefix": prefix,
            "prefix_tokens": tokens.tokens,
            "prefix_token_ids": tokens.ids,
        }

    @staticmethod
    def collate_fn(batch: List[Dict[str, Any]]) -> MiniBatch:
        """Collate examples into a batch.

        Each field of the returned `MiniBatch` is a plain list holding the
        corresponding value of every item, in batch order; nothing is padded
        or stacked here.
        """
        numbers = [item["nums"] for item in batch]
        target = [item["target"] for item in batch]
        prefix = [item["prefix"] for item in batch]
        prefix_tokens = [item["prefix_tokens"] for item in batch]
        prefix_token_ids = [item["prefix_token_ids"] for item in batch]
        return MiniBatch(
            numbers=numbers,
            target=target,
            prefix=prefix,
            prefix_tokens=prefix_tokens,
            prefix_token_ids=prefix_token_ids,
        )
def format_reward_function(response: str, end_token: Optional[str] = None) -> float:
    """Score how closely `response` follows the expected tag layout.

    The full layout is ``<think>...</think>``, a newline, then
    ``<answer>...</answer>``, anchored at both ends of the response; it earns
    1.0. Otherwise partial credit is summed: 0.1 when a think block occurs
    anywhere, plus 0.5 when an answer block occurs anywhere.

    Args:
        response: Text to score.
        end_token: Optional token removed from the end of `response` (when
            present there) before any matching is done.

    Returns:
        1.0 for the full layout, otherwise the partial credit (0.0 to 0.6).
    """
    # Remove a trailing end token so it cannot defeat the end-anchored match.
    text = response
    if end_token and text.endswith(end_token):
        text = text[: -len(end_token)]

    # A perfectly formatted response needs no partial scoring.
    if re.match(r"^<think>.*?<\/think>\n<answer>.*?<\/answer>$", text, re.DOTALL):
        return 1.0

    score = 0.0
    for pattern, credit in (
        (r"<think>.*?<\/think>", 0.1),
        (r"<answer>.*?<\/answer>", 0.5),
    ):
        if re.search(pattern, text, re.DOTALL):
            score += credit
    return score
def answer_reward_function(
    response: str, numbers: Optional[List[int]] = None, target: Optional[int] = None
) -> float:
    """Check that the answer uses all numbers exactly once and hits the target.

    The expression is taken from the first ``<answer>...</answer>`` block of
    `response`. It may contain only digits, spaces, parentheses and the
    operators ``+ - * /``.

    Args:
        response: Model output to score.
        numbers: Numbers that must each appear exactly once in the expression.
        target: Value the expression has to evaluate to.

    Returns:
        1.0 if the expression is well formed, uses exactly `numbers` and
        evaluates to within 1e-5 of `target`; 0.0 in every other case,
        including when `numbers` or `target` is not given.
    """
    # Nothing to verify against; previously this crashed in `sorted(None)`.
    if numbers is None or target is None:
        return 0.0

    answer_match = re.search(r"<answer>(.*?)<\/answer>", response, re.DOTALL)
    if not answer_match:
        return 0.0

    answer_content = answer_match.group(1)
    if not answer_content:
        return 0.0

    if not re.match(r"^[0-9+\-*/() ]+$", answer_content):
        return 0.0

    # The character whitelist still lets `**` and `//` through. Neither is one
    # of the four allowed operations, and a power tower such as `9**9**9**9`
    # would effectively hang the evaluation below.
    if "**" in answer_content or "//" in answer_content:
        return 0.0

    # Check if the answer uses all numbers exactly once
    try:
        used_numbers = [int(n) for n in re.findall(r"\d+", answer_content)]
    except ValueError:
        # Newer Python versions refuse to convert extremely long digit runs.
        return 0.0
    if sorted(used_numbers) != sorted(numbers):
        return 0.0

    # Check if the answer evaluates to the target
    # SECURITY: `eval` runs on model-generated text. It is only reached after
    # the whitelist and operator checks above and runs without builtins; keep
    # those guards in place if this function is ever changed.
    try:
        result = eval(answer_content, {"__builtins__": None}, {})
        is_correct = abs(float(result) - float(target)) < 1e-5
    except Exception:
        # Deliberate best effort: malformed expressions (syntax errors,
        # division by zero, ...) simply earn no reward. `Exception` rather
        # than a bare `except` so Ctrl-C / SystemExit still propagate.
        return 0.0
    return 1.0 if is_correct else 0.0
def reward_function(
    response: str,
    numbers: List[int] = None,
    target: int = None,
    end_token: str = None,
) -> Dict[str, Any]:
    """Combine the format and answer scores for a Countdown response.

    Total reward = 0.1 * format_reward + answer_reward

    The format score is computed on ``"<think>" + response``, presumably
    because the response prompt already ends with the opening tag; the answer
    score is computed on `response` as given.

    Returns:
        A dict with the total under ``"reward"`` and both component scores
        under ``"reward_info"``.
    """
    fmt_score = format_reward_function("<think>" + response, end_token)
    ans_score = answer_reward_function(response, numbers, target)

    total = fmt_score * 0.1 + ans_score
    breakdown = {
        "format_reward": fmt_score,
        "answer_reward": ans_score,
    }
    return {"reward": total, "reward_info": breakdown}