eval_frames_benchmark.py
import argparse
import json
import os
import time
from typing import List, Dict
from openai import OpenAI
from datasets import load_dataset
from tqdm import tqdm
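# The client below points at a local optillm proxy serving an OpenAI-compatible API;
# the commented-out alternative calls the OpenAI API directly instead.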
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="http://localhost:8000/v1")
# client = OpenAI()
SLEEP_INTERVAL = 300
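# Note: SLEEP_INTERVAL is only used by the commented-out time.sleep call in main().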
def load_existing_results(filename: str) -> List[Dict]:
    """Return previously saved results, or an empty list if the file does not exist yet."""
    try:
        with open(filename, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return []

def save_result(filename: str, result: Dict):
    """Append a single result and rewrite the JSON file so progress survives interruptions."""
    results = load_existing_results(filename)
    results.append(result)
    with open(filename, 'w') as f:
        json.dump(results, f, indent=2)

def get_last_processed_index(results: List[Dict]) -> int:
    if not results:
        return -1
    return max(int(r.get('index', -1)) for r in results)

def generate_llm_prompt(prompt: str, wiki_links: List[str]) -> str:
    # The wiki links are embedded in the prompt so the readurls approach can fetch their contents.
    return f"Here are the relevant Wikipedia articles:\n{wiki_links}\n\nBased on all the information, answer the query.\n\nQuery: {prompt}\n\n"

def get_llm_response(prompt: str, model: str) -> str:
    # extra_body routes the request through optillm's readurls & memory approaches.
    response = client.with_options(timeout=1000.0).chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.7,
        extra_body={"optillm_approach": "readurls&memory"}
    )
    return response.choices[0].message.content.strip()

def evaluate_response(question: str, llm_response: str, ground_truth: str, model: str) -> Dict[str, str]:
    evaluation_prompt = f"""===Task===
I need your help in evaluating an answer provided by an LLM against a ground
truth answer. Your task is to determine if the ground truth answer is present in the LLM's
response. Please analyze the provided data and make a decision.
===Instructions===
1. Carefully compare the "Predicted Answer" with the "Ground Truth Answer".
2. Consider the substance of the answers - look for equivalent information or correct answers.
Do not focus on exact wording unless the exact wording is crucial to the meaning.
3. Your final decision should be based on whether the meaning and the vital facts of the
"Ground Truth Answer" are present in the "Predicted Answer".
===Input Data===
- Question: {question}
- Predicted Answer: {llm_response}
- Ground Truth Answer: {ground_truth}
===Output Format===
Provide your final evaluation in the following format:
"Explanation:" (how you reached your decision)
"Decision:" ("TRUE" or "FALSE")
Please proceed with the evaluation."""

    evaluation_response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": evaluation_prompt}
        ],
        max_tokens=300,
        n=1,
        stop=None,
        temperature=0.3,
    )
    evaluation_text = evaluation_response.choices[0].message.content.strip()

    # Extract the decision and explanation from the judge's output
    lines = evaluation_text.split('\n')
    decision = "FALSE"
    explanation = ""
    for line in lines:
        if line.startswith("Decision:"):
            decision = line.split(":")[1].strip().upper()
        elif line.startswith("Explanation:"):
            explanation = line.split(":", 1)[1].strip()
    return {"decision": decision, "explanation": explanation}

def main(model: str):
    # Load the dataset
    dataset = load_dataset("google/frames-benchmark", split="test")

    filename = f"evaluation_results_{model.replace('/', '_')}.json"
    existing_results = load_existing_results(filename)
    last_processed_index = get_last_processed_index(existing_results)

    for item in tqdm(dataset, desc="Processing samples"):
        index = int(item['Unnamed: 0'])
        # Skip samples that were already evaluated in a previous run
        if index <= last_processed_index:
            continue

        prompt = generate_llm_prompt(item['Prompt'], item['wiki_links'])
        llm_response = get_llm_response(prompt, model)
        evaluation = evaluate_response(item['Prompt'], llm_response, item['Answer'], model)

        result = {
            "index": index,
            "prompt": item['Prompt'],
            "ground_truth": item['Answer'],
            "llm_response": llm_response,
            "evaluation_decision": evaluation['decision'],
            "evaluation_explanation": evaluation['explanation'],
            "reasoning_type": item['reasoning_types']
        }
        save_result(filename, result)
        # print(f"Index: {index}, Decision: {result['evaluation_decision']}")
        # time.sleep(SLEEP_INTERVAL)
    # Calculate and print summary statistics
    results = load_existing_results(filename)
    total_samples = len(results)
    correct_answers = sum(1 for r in results if r['evaluation_decision'] == 'TRUE')
    # Guard against an empty result set to avoid a division by zero
    accuracy = correct_answers / total_samples if total_samples else 0.0

    print(f"Model: {model}")
    print(f"Total samples: {total_samples}")
    print(f"Correct answers: {correct_answers}")
    print(f"Accuracy: {accuracy:.2%}")

    # Print accuracy by reasoning type
    reasoning_types = set(r['reasoning_type'] for r in results)
    for rt in reasoning_types:
        rt_samples = [r for r in results if r['reasoning_type'] == rt]
        rt_correct = sum(1 for r in rt_samples if r['evaluation_decision'] == 'TRUE')
        rt_accuracy = rt_correct / len(rt_samples)
        print(f"Accuracy for {rt}: {rt_accuracy:.2%}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate LLM performance on google/frames-benchmark")
    parser.add_argument("--model", type=str, required=True, help="OpenAI model to use (e.g., gpt-4o, gpt-4o-mini)")
    args = parser.parse_args()
    main(args.model)
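# Example invocation (a sketch; assumes the local proxy above is reachable at
# http://localhost:8000/v1 and OPENAI_API_KEY is set in the environment):
#
#   python eval_frames_benchmark.py --model gpt-4o-mini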