Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benchmarking Different Response models #11

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions scripts/test_gsm8k_response_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from braintrust import Eval
from autoevals.value import ExactMatch
from datasets import load_dataset
from openai import AsyncOpenAI
from pydantic import BaseModel
import instructor
from asyncio import run
from uuid import uuid4
import json

# NOTE(review): this import-time load (no `split` argument) is not read anywhere
# in the visible code; `main` loads the "test" split itself. Candidate for
# removal -- confirm nothing else imports this name first.
dataset = load_dataset("567-labs/gsm8k")
# Async OpenAI client; it is wrapped by instructor further down so that
# completions can be requested with a `response_model`.
oai = AsyncOpenAI()


# Baseline response model: a reasoning string followed by the integer answer.
# Documented with `#` comments on purpose: Pydantic copies a class docstring
# into the JSON schema, and `main` records that schema for every experiment,
# so a docstring here would change what is being benchmarked.
class Answer(BaseModel):
    chain_of_thought: str  # declared before `answer`, so it precedes it in the schema
    answer: int  # the only field the eval task returns for scoring


# Variant of the baseline that adds a list of calculations between the
# reasoning string and the final answer.
# Documented with `#` comments on purpose: Pydantic copies a class docstring
# into the JSON schema, and `main` records that schema for every experiment.
class AnswerWithCalculations(BaseModel):
    chain_of_thought: str  # first field in the schema
    required_calculations: list[str]  # one string per calculation; exact content is up to the LLM
    answer: int  # the only field the eval task returns for scoring


# Variant that asks for a list of assumptions and a logic-flow string before
# the final answer.
# Documented with `#` comments on purpose: Pydantic copies a class docstring
# into the JSON schema, and `main` records that schema for every experiment.
class AssumptionBasedAnswer(BaseModel):
    assumptions: list[str]  # first field in the schema
    logic_flow: str  # single string, declared after the assumptions
    answer: int  # the only field the eval task returns for scoring


# Variant with three list-valued fields (steps, pitfalls, intermediate
# results) ahead of the final answer.
# Documented with `#` comments on purpose: Pydantic copies a class docstring
# into the JSON schema, and `main` records that schema for every experiment.
class ErrorAwareCalculation(BaseModel):
    key_steps: list[str]  # first field in the schema
    potential_pitfalls: list[str]
    intermediate_results: list[str]  # kept as strings, not numbers
    answer: int  # the only field the eval task returns for scoring


# instructor mode shared by every experiment; `mode.value` is also written
# into each experiment's metadata by `main`.
mode = instructor.Mode.JSON
# instructor-wrapped client: the eval task calls
# `client.chat.completions.create(..., response_model=...)` on it.
client = instructor.from_openai(oai, mode=mode)

# Response schemas under comparison. `main` runs one experiment per entry,
# in this order.
response_models = [
    Answer,
    AnswerWithCalculations,
    AssumptionBasedAnswer,
    ErrorAwareCalculation,
]


def _make_task(response_model, model):
    """Build the eval task coroutine function for one response model.

    A factory is used instead of defining the closure inside the experiment
    loop so that each task is bound to its own ``response_model`` argument
    rather than to the loop variable (late-binding closure hazard).

    Args:
        response_model: Pydantic model class passed to instructor as the
            structured-output schema. Must expose an ``answer`` field.
        model: Name of the chat model to query.

    Returns:
        An ``async def task(question, hooks)`` callable that returns the
        ``answer`` field of the parsed response.
    """

    async def task(question, hooks):
        resp = await client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that can solve math problems. Answer the question with the correct response",
                },
                {"role": "user", "content": question},
            ],
            response_model=response_model,
        )

        # Keep the schema and the full structured response alongside the
        # score so individual rows can be inspected later.
        hooks.meta(
            response_model_name=response_model.__name__,
            response_model=json.dumps(response_model.model_json_schema()),
            response=resp.model_dump_json(),
        )
        # Only the final answer is scored; the other fields are metadata.
        return resp.answer

    return task


async def main(*, n_samples=200, model="gpt-4o-mini", max_concurrency=10):
    """Run one eval experiment per entry in ``response_models``.

    Loads the "test" split of ``567-labs/gsm8k``, keeps the first
    ``n_samples`` rows, and evaluates every response model on them with
    ``ExactMatch`` scoring. Experiments run one after another; all of them
    share a single run id so they can be grouped afterwards.

    Args:
        n_samples: Number of leading rows of the test split to evaluate.
        model: Chat model name, used both for the API call and for the
            experiment metadata so the two cannot drift apart.
        max_concurrency: Forwarded to ``Eval`` as ``max_concurrency``.

    Raises:
        ValueError: If ``n_samples`` is less than 1 (a non-positive slice
            bound would silently select the wrong rows).
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")

    run_id = uuid4()
    print(f"Running eval with uuid: {run_id}")

    samples = list(load_dataset("567-labs/gsm8k", split="test"))[:n_samples]

    def eval_data():
        # Fresh row dicts for every experiment, as in a per-call data loader.
        return [
            {"input": row["question"], "expected": row["answer"]}
            for row in samples
        ]

    for response_model in response_models:
        await Eval(
            name="567-labs/gsm8k",
            experiment_name=f"gsm8k-{response_model.__name__}-{run_id}",
            data=eval_data,
            task=_make_task(response_model, model),
            scores=[ExactMatch],
            metadata={
                "model": model,
                # Actual row count: may be below n_samples if the split is short.
                "n_samples": len(samples),
                "response_model": response_model.__name__,
                "mode": mode.value,
            },
            max_concurrency=max_concurrency,
        )


# Start the benchmark only when this file is executed as a script. The file
# name matches pytest's default `test_*.py` discovery pattern, so without the
# guard merely collecting or importing the module would launch every
# experiment (network calls and API spend).
if __name__ == "__main__":
    run(main())