evaluate_benchmark.py
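"""Score benchmark responses against per-query evaluation criteria.

For each record in the input JSONL file, the script looks up the matching
query and criteria, asks the selected evaluator (ClaudeAgent or CriticAgent)
for a JSON verdict on every criterion, and appends the collected scores to
the output JSONL file. Previously scored records in the output file are
skipped when the script is re-run.
"""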
import json
import os
import argparse

import jsonlines
from tqdm import tqdm

from prompt import evaluate_system, evaluate_prompt
from evaluator import ClaudeAgent, CriticAgent

EVAL_TIMES = 1


class EvalAgent(object):
    def __init__(self, agent):
        self.agent = agent

    def success_check_fn_score(self, response):
        """Return True if the evaluator's reply is valid JSON with an integer
        'score' in [1, 10] and a string 'reason'."""
        try:
            result = json.loads(response.strip('json|```'))
        except json.JSONDecodeError as e:
            print("JSON decode error:", e)
            return False
        valid_score_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        if "score" not in result or "reason" not in result:
            print("Missing 'score' or 'reason' in the result")
            return False
        if result["score"] not in valid_score_values:
            return False
        if not isinstance(result["reason"], str):
            return False
        return True
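
    # Illustrative example (not taken from the repository) of a reply that
    # success_check_fn_score accepts once the surrounding ```json fence
    # characters are stripped:
    #   {"score": 8, "reason": "Covers the main points but misses edge cases."}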

    def generate_score(self, content, query, criteria):
        """Ask the evaluator to score one response against one criterion,
        retrying up to three times until a valid JSON verdict is returned."""
        prompt_data = {
            "query": query,
            "response": content["response"],
            "criteria": criteria,
        }
        retry = 0
        success = False
        response = None
        while not success and retry < 3:
            prompt = evaluate_prompt.format(**prompt_data)
            response, success = self.agent.run(
                prompt=prompt,
                success_check_fn=self.success_check_fn_score
            )
            if success:
                try:
                    response = json.loads(response.strip('json|```'))
                except json.JSONDecodeError as e:
                    print("JSON decode error:", e)
                    # Fall back to literal evaluation of near-JSON output.
                    response = eval(response.strip('json|```'))
            retry += 1
        if success:
            return response
        raise ValueError("Failed to generate score!")


def save_output(output, file_name):
    """
    Appends output records to the specified file in JSONL format.
    """
    with open(file_name, 'a', encoding='utf-8') as f:
        for record in output:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')


def load_file(file_name):
    """
    Loads JSONL lines from a file into a list of dictionaries.
    """
    if os.path.isfile(file_name):
        with open(file_name, 'r', encoding='utf-8') as f:
            records = [json.loads(line) for line in f]
        return records, len(records)
    return [], 0


def load_query_criteria(jsonl_file_path):
    """
    Loads queries and criteria from a JSONL file into a dictionary keyed by index.
    """
    data_list = {}
    with jsonlines.open(jsonl_file_path) as reader:
        for obj in reader:
            data_list[obj['index']] = {
                'query': obj['query'],
                'criteria': obj['criteria'],
            }
    return data_list
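

# Expected record layout (an assumption inferred from the field names used
# above and below; any extra fields are ignored):
#   query/criteria file, one JSON object per line:
#     {"index": 0, "query": "...", "criteria": [{"name": "accuracy", ...}, ...]}
#   input file, one JSON object per line:
#     {"index": 0, "response": "..."}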


def process(agent, input_file, out_file, id_query_criteria_map):
    """
    Processes input files through the evaluation agent, producing scores and saving results.
    """
    records, existing_count = load_file(out_file)
    cnt = existing_count
    contents, input_cnt = load_file(input_file)
    with tqdm(total=input_cnt, initial=0, desc=f"Processing {input_file.split('/')[-1]}") as pbar:
        for i, content in enumerate(contents):
            # Skip records already saved in the output file; the last saved
            # record is processed again.
            if existing_count > 0 and i < existing_count - 1:
                pbar.update()
                continue
            data = {
                "index": content["index"],
                "scores": {}
            }
            query = id_query_criteria_map[content["index"]]['query']
            criteria = id_query_criteria_map[content["index"]]['criteria']
            with tqdm(total=len(criteria) * EVAL_TIMES, desc=f"Data ID {content['index']} Progress", leave=False) as internal_pbar:
                for c in criteria:
                    if c["name"] not in data["scores"]:
                        data["scores"][c["name"]] = []
                    while len(data["scores"][c["name"]]) < EVAL_TIMES:
                        score = agent.generate_score(content, query, c)
                        data["scores"][c["name"]].append(score)
                        internal_pbar.update(1)
            save_output([data], out_file)
            cnt += 1
            pbar.update()
    print(f"CNT: {cnt}")
    return
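

# Each saved record aggregates EVAL_TIMES verdicts per criterion, e.g.
# (the criterion name here is illustrative):
#   {"index": 0, "scores": {"accuracy": [{"score": 8, "reason": "..."}]}}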


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process lines from an input file.")
    parser.add_argument("--evaluator", choices=['claude', 'critic'], required=True, help="Choose the scoring model to use: 'claude' or 'critic'.")
    parser.add_argument("--query_criteria_file", type=str, help="Path to the query and criteria file.")
    parser.add_argument("--input_file", type=str, help="Path to the input file.")
    parser.add_argument("--output_file", type=str, help="Path to the output file.")
    args = parser.parse_args()

    # Evaluator initialization based on the chosen model
    if args.evaluator == 'claude':
        agent = EvalAgent(ClaudeAgent(
            system_prompt=evaluate_system,
        ))
    else:
        agent = EvalAgent(CriticAgent(
            system_prompt=evaluate_system,
        ))

    id_query_criteria_map = load_query_criteria(args.query_criteria_file)
    process(agent, args.input_file, args.output_file, id_query_criteria_map)
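
# Example invocation (file paths are placeholders, not files shipped with the
# repository):
#   python evaluate_benchmark.py \
#       --evaluator claude \
#       --query_criteria_file query_criteria.jsonl \
#       --input_file model_responses.jsonl \
#       --output_file scores.jsonl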