-
-
Notifications
You must be signed in to change notification settings - Fork 1.3k
feat: add a way to evaluate llms for our usecase against our prompts #1700
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
566b0bf
add eval
retrogtx b44a464
add evalite and autoevals scripts to package.json
retrogtx 840b61a
Merge branch 'staging' into eval
retrogtx 2acecf6
update packages
retrogtx c95865b
add custom timeout
retrogtx 6b88b39
add context using our pre defined prompts to increase scores, add son…
retrogtx 14abaa1
error handling
retrogtx 3245d0b
utilize dynamic test case generation for gmail search queries and oth…
retrogtx File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,240 @@ | ||
| import { evalite } from "evalite"; | ||
| import { openai } from "@ai-sdk/openai"; | ||
| import { streamText } from "ai"; | ||
| import { traceAISDKModel } from "evalite/ai-sdk"; | ||
| import { Factuality, Levenshtein } from "autoevals"; | ||
| import { AiChatPrompt, GmailSearchAssistantSystemPrompt, StyledEmailAssistantSystemPrompt } from "../src/lib/prompts"; | ||
| import { generateObject } from "ai"; | ||
| import { z } from "zod"; | ||
|
|
||
| // base model (untraced) for internal helpers to avoid trace errors | ||
| // add ur own model here | ||
| const baseModel = openai("gpt-4o-mini"); | ||
|
|
||
| // traced model for the actual task under test | ||
| const model = traceAISDKModel(baseModel); | ||
|
|
||
| // error handling incase llm fails | ||
| const safeStreamText = async (config: Parameters<typeof streamText>[0]) => { | ||
| try { | ||
| const res = await streamText(config); | ||
| return res.textStream; | ||
| } catch (err) { | ||
| console.error("LLM call failed", err); | ||
| return "ERROR"; | ||
| } | ||
| }; | ||
|
|
||
| /** | ||
| * basic tests to cover all major capabilities, avg score is 30%, anything above is goated: | ||
| * - mail search and filtering | ||
| * - label management and organization | ||
| * - bulk operations (archive, delete, mark read/unread) | ||
| * - email composition and sending | ||
| * - smart categorization (subscriptions, newsletters, meetings) | ||
| * - web search integration | ||
| * - user interaction patterns | ||
| */ | ||
|
|
||
|
|
||
| // forever todo: make the expected output autistically specific | ||
retrogtx marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| // Dynamically builds a list of natural-language queries and their minimal expected Gmail-syntax | ||
| const buildGmailSearchTestCases = async (): Promise<{ input: string; expected: string }[]> => { | ||
| const { object } = await generateObject({ | ||
| model: baseModel, | ||
| system: `You are a JSON test-case generator for Gmail search query conversions. | ||
| Return ONLY a JSON object with a single key "cases" mapping to an array. Each array element has exactly the keys {input, expected}. | ||
| Guidelines: | ||
| • input – natural-language requests about searching/filtering email. | ||
| • expected – a short Gmail-syntax fragment (e.g., "is:unread", "has:attachment", "after:") that MUST appear in a correct answer. | ||
| • Cover diverse filters: sender, subject, attachments, labels, dates, read/unread. | ||
| • Array length: 8-12. | ||
| • No comments or additional keys.`, | ||
| prompt: "Generate Gmail search conversion test cases", | ||
| schema: z.object({ | ||
| cases: z.array( | ||
| z.object({ | ||
| input: z.string().min(5), | ||
| expected: z.string().min(3), | ||
| }), | ||
| ), | ||
| }), | ||
| }); | ||
|
|
||
| return object.cases; | ||
| }; | ||
|
|
||
| // generic dynamic testcase builder | ||
|
|
||
| type TestCase = { input: string; expected: string }; | ||
|
|
||
| const makeAiChatTestCaseBuilder = (topic: string): (() => Promise<TestCase[]>) => { | ||
| return async () => { | ||
| const { object } = await generateObject({ | ||
| model: baseModel, | ||
| system: `You are a JSON test-case generator for the topic: ${topic}. | ||
| Return ONLY a JSON object with key "cases" whose value is an array of objects {input, expected}. | ||
| Guidelines: | ||
| • input – natural-language request related to ${topic}. | ||
| • expected – short keyword (≤3 words) expected in correct assistant reply. | ||
| • Array length: 6-10. | ||
| • No extra keys or comments.`, | ||
| prompt: `Generate ${topic} test cases`, | ||
| schema: z.object({ | ||
| cases: z.array( | ||
| z.object({ | ||
| input: z.string().min(5), | ||
| expected: z.string().min(2), | ||
| }), | ||
| ), | ||
| }), | ||
| }); | ||
|
|
||
| return object.cases; | ||
| }; | ||
| }; | ||
|
|
||
| evalite("AI Chat – Basic Responses", { | ||
| data: makeAiChatTestCaseBuilder("basic responses (greetings, capabilities, quick help)"), | ||
| task: async (input) => { | ||
| return safeStreamText({ | ||
| model: model, | ||
| system: AiChatPrompt("test-thread-id", "inbox", ""), | ||
| prompt: input, | ||
| }); | ||
| }, | ||
| scorers: [Factuality, Levenshtein], | ||
| }); | ||
|
|
||
| evalite("Gmail Search Query – Natural Language", { | ||
| data: buildGmailSearchTestCases, | ||
| task: async (input) => { | ||
| return safeStreamText({ | ||
| model: model, | ||
| system: GmailSearchAssistantSystemPrompt(), | ||
| prompt: input, | ||
| }); | ||
| }, | ||
| scorers: [Factuality, Levenshtein], | ||
| }); | ||
|
|
||
| evalite("AI Chat – Label Management", { | ||
| data: makeAiChatTestCaseBuilder("label management (create, delete, list, apply labels)"), | ||
| task: async (input) => { | ||
| return safeStreamText({ | ||
| model: model, | ||
| system: AiChatPrompt("test-thread-id", "inbox", ""), | ||
| prompt: input, | ||
| }); | ||
| }, | ||
| scorers: [Factuality, Levenshtein], | ||
| }); | ||
|
|
||
| evalite("AI Chat – Email Organization", { | ||
| data: makeAiChatTestCaseBuilder("email organization (archive, mark read/unread, bulk actions)"), | ||
| task: async (input) => { | ||
| return safeStreamText({ | ||
| model: model, | ||
| system: AiChatPrompt("test-thread-id", "inbox", ""), | ||
| prompt: input, | ||
| }); | ||
| }, | ||
| scorers: [Factuality, Levenshtein], | ||
| }); | ||
|
|
||
| evalite("AI Chat – Email Composition", { | ||
| data: makeAiChatTestCaseBuilder("email composition tasks (compose, reply, send, draft)"), | ||
| task: async (input) => { | ||
| return safeStreamText({ | ||
| model: model, | ||
| system: AiChatPrompt("test-thread-id", "inbox", ""), | ||
| prompt: input, | ||
| }); | ||
| }, | ||
| scorers: [Factuality, Levenshtein], | ||
| }); | ||
|
|
||
| evalite("AI Chat – Smart Categorization", { | ||
| data: makeAiChatTestCaseBuilder("smart categorization (subscriptions, newsletters, meetings, bills)"), | ||
| task: async (input) => { | ||
| return safeStreamText({ | ||
| model: model, | ||
| system: AiChatPrompt("test-thread-id", "inbox", ""), | ||
| prompt: input, | ||
| }); | ||
| }, | ||
| scorers: [Factuality, Levenshtein], | ||
| }); | ||
|
|
||
| evalite("AI Chat – Information Queries", { | ||
| data: makeAiChatTestCaseBuilder("information queries (summaries, web search, tax docs, recent activity)"), | ||
| task: async (input) => { | ||
| return safeStreamText({ | ||
| model: model, | ||
| system: AiChatPrompt("test-thread-id", "inbox", ""), | ||
| prompt: input, | ||
| }); | ||
| }, | ||
| scorers: [Factuality, Levenshtein], | ||
| }); | ||
|
|
||
| evalite("AI Chat – Complex Workflows", { | ||
| data: makeAiChatTestCaseBuilder("complex workflows (multi-step actions, automation)"), | ||
| task: async (input) => { | ||
| return safeStreamText({ | ||
| model: model, | ||
| system: AiChatPrompt("test-thread-id", "inbox", ""), | ||
| prompt: input, | ||
| }); | ||
| }, | ||
| scorers: [Factuality, Levenshtein], | ||
| }); | ||
|
|
||
| evalite("AI Chat – User Intent Recognition", { | ||
| data: makeAiChatTestCaseBuilder("user intent recognition (help, overwhelm, search, cleanup)"), | ||
| task: async (input) => { | ||
| return safeStreamText({ | ||
| model: model, | ||
| system: AiChatPrompt("test-thread-id", "inbox", ""), | ||
| prompt: input, | ||
| }); | ||
| }, | ||
| scorers: [Factuality, Levenshtein], | ||
| }); | ||
|
|
||
| evalite("AI Chat – Error Handling & Edge Cases", { | ||
| data: makeAiChatTestCaseBuilder("error handling & edge cases (invalid, bulk actions, very old queries)"), | ||
| task: async (input) => { | ||
| return safeStreamText({ | ||
| model: model, | ||
| system: AiChatPrompt("test-thread-id", "inbox", ""), | ||
| prompt: input, | ||
| }); | ||
| }, | ||
| scorers: [Factuality, Levenshtein], | ||
| }); | ||
|
|
||
| evalite("Gmail Search Query Building", { | ||
| data: buildGmailSearchTestCases, | ||
| task: async (input) => { | ||
| return safeStreamText({ | ||
| model: model, | ||
| system: GmailSearchAssistantSystemPrompt(), | ||
| prompt: input, | ||
| }); | ||
| }, | ||
| scorers: [Factuality, Levenshtein], | ||
| }); | ||
|
|
||
| evalite("Email Composition with Style Matching", { | ||
| data: makeAiChatTestCaseBuilder("styled email composition (follow-up, thank you, meeting, apology)"), | ||
| task: async (input) => { | ||
| return safeStreamText({ | ||
| model: model, | ||
| system: StyledEmailAssistantSystemPrompt(), | ||
| prompt: input, | ||
| }); | ||
| }, | ||
| scorers: [Factuality, Levenshtein], | ||
| }); | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| { | ||
| "extends": "@zero/tsconfig/base", | ||
| "include": ["src/**/*.ts", "src/overrides.d.ts", "worker-configuration.d.ts", "drizzle.config.ts"] | ||
| "include": ["src/**/*.ts", "src/overrides.d.ts", "worker-configuration.d.ts", "drizzle.config.ts", "tests/**/*.ts", "evals/**/*.ts"] | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| import { defineConfig } from "vite"; | ||
|
|
||
| export default defineConfig({ | ||
| test: { | ||
| testTimeout: 120000, | ||
| hookTimeout: 120000, | ||
| teardownTimeout: 120000, | ||
| }, | ||
| }); | ||
retrogtx marked this conversation as resolved.
Show resolved
Hide resolved
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.