diff --git a/README.md b/README.md index 9b732fed84..8cf36993aa 100644 --- a/README.md +++ b/README.md @@ -26,3 +26,70 @@ Want to contribute? Read our [Contributing Guide](./CONTRIBUTING.md) ## Quadratic is hiring Check out our open roles ⟶ [careers.quadratichq.com](https://careers.quadratichq.com) + +# Quadratic QA Test Suite + +This repository contains automated tests for the Quadratic application using Playwright. + +## Prerequisites + +- Node.js (v14 or newer) +- npm + +## Setup + +1. Install dependencies: + ``` + npm install + ``` + +2. Install Playwright browsers: + ``` + npx playwright install + ``` + +## Running Tests + +Run all tests: +``` +npm test +``` + +Run tests with UI mode (for debugging and development): +``` +npm run test:ui +``` + +Run tests in headed mode (with visible browser): +``` +npm run test:headed +``` + +Run tests in debug mode: +``` +npm run test:debug +``` + +View the HTML report after a test run: +``` +npm run report +``` + +## Test Structure + +- `tests/homepage.spec.ts` - Basic tests for the homepage +- `tests/login.spec.ts` - Tests for login functionality +- `tests/spreadsheet.spec.ts` - Tests for spreadsheet functionality + +## Notes + +- These tests are designed for the Quadratic application at `app.quadratichq.com` +- The selectors in the tests may need to be updated based on the actual UI +- Screenshots are saved in the project root directory + +## Adding New Tests + +1. Create a new file in the `tests` directory with a `.spec.ts` extension +2. Import the necessary Playwright modules +3. Write your tests using the Playwright API +4. Run the tests to verify they work as expected diff --git a/package-lock.json b/package-lock.json index dab2d3e06c..a5e288b012 100644 --- a/package-lock.json +++ b/package-lock.json @@ -29,6 +29,7 @@ "chalk": "^5.3.0", "commander": "^11.1.0", "concurrently": "^6.5.1", + "dotenv": "^16.4.7", "eslint": "^8.57.0", "jest": "^29.6.1", "kill-port": "^2.0.1", @@ -19908,7 +19909,9 @@ } }, "node_modules/dotenv": { - "version": "16.4.4", + "version": "16.4.7", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.7.tgz", + "integrity": "sha512-47qPchRCykZC03FhkYAhrvwU4xDBFIj1QPqaarj6mdM/hgUzfPHcpkHJOn3mJAufFeeAxAzeGsr5X0M4k6fLZQ==", "license": "BSD-2-Clause", "engines": { "node": ">=12" diff --git a/package.json b/package.json index c33d674f5d..81d162611b 100644 --- a/package.json +++ b/package.json @@ -84,6 +84,7 @@ "chalk": "^5.3.0", "commander": "^11.1.0", "concurrently": "^6.5.1", + "dotenv": "^16.4.7", "eslint": "^8.57.0", "jest": "^29.6.1", "kill-port": "^2.0.1", diff --git a/quadratic-ai-eval/.gitignore b/quadratic-ai-eval/.gitignore new file mode 100644 index 0000000000..7897af19c6 --- /dev/null +++ b/quadratic-ai-eval/.gitignore @@ -0,0 +1,50 @@ +# Playwright specific +/test-results/ +/playwright-report/ +/blob-report/ +/playwright/.cache/ +/html-report/ +/allure-results/ +/allure-report/ + +# Node.js dependencies +/node_modules/ +/package-lock.json +/yarn.lock +/pnpm-lock.yaml + +# Environment variables +.env +.env.local +.env.development +.env.test +.env.production + +# IDE specific files +.idea/ +.vscode/ +*.code-workspace +.DS_Store + +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# Coverage directory +/coverage/ + +# Temporary files +/tmp/ +/temp/ + +# Screenshots and videos from test runs +/screenshots/ +/videos/ + +# User-specific files +*.user +*.suo +*.userprefs \ No newline at end of file diff --git a/quadratic-ai-eval/README.md b/quadratic-ai-eval/README.md new file mode 100644 
index 0000000000..576104e533 --- /dev/null +++ b/quadratic-ai-eval/README.md @@ -0,0 +1,121 @@ +# Quadratic AI Evaluation Framework + +This framework allows you to test Quadratic's AI capabilities by running multiple prompts in parallel and evaluating the results using Claude. + +## Setup + +1. Install dependencies: +```bash +npm install +``` + +2. Set up environment variables: +```bash +# Create a .env file with your Anthropic API key +echo "ANTHROPIC_API_KEY=your_api_key_here" > .env +``` + +## Running Tests + +To run all tests: +```bash +npx playwright test +``` + +To run the prompt evaluation tests specifically: +```bash +npx playwright test prompt-evaluation.spec.ts +# or use the npm script +npm run test:prompt +``` + +To run tests with a UI: +```bash +npx playwright test --ui +``` + +## Test Categories + +The framework includes different categories of tests: + +- **Basic prompts**: Simple data visualization prompts (run with `npm run test:basic`) +- **Complex data prompts**: More complex data analysis prompts (run with `npm run test:complex`) +- **Parallel execution**: Run all tests in parallel (run with `npm run test:parallel`) + +## Adding New Tests + +To add new test prompts, edit the `tests/prompt-tests.ts` file and add your prompts to the appropriate array: + +```typescript +// For basic data visualization prompts +export const testPrompts: PromptTest[] = [ + { + name: 'Your Test Name', + prompt: 'Your prompt text here', + validationCriteria: [ + 'Criterion 1?', + 'Criterion 2?', + 'Criterion 3?', + 'Criterion 4?', + 'Criterion 5?' + ], + expectedRating: 'GREEN' // or 'YELLOW' or 'RED' + }, + // Add more test prompts here +]; + +// For more complex data analysis prompts +export const complexDataPrompts: PromptTest[] = [ + // Add your complex data prompts here +]; +``` + +### Test Structure + +Each test prompt consists of: + +- `name`: A descriptive name for the test +- `prompt`: The actual prompt to send to Quadratic +- `validationCriteria`: An array of questions that Claude will use to evaluate the result +- `expectedRating`: The expected rating from Claude (GREEN, YELLOW, or RED) + +## How It Works + +1. The test framework logs in to Quadratic using Auth0 credentials +2. For each prompt in the test arrays: + - It navigates to the file creation page with the prompt + - Waits for the spreadsheet to be generated + - Takes a screenshot of the result + - Sends the screenshot to Claude for evaluation + - Validates that the result meets the expected criteria + +## Evaluation Criteria + +Claude evaluates each result and provides a rating: + +- **GREEN**: The result looks correct and fully satisfies the prompt requirements +- **YELLOW**: The result partially satisfies the prompt but has minor issues +- **RED**: The result is incorrect or has major issues + +## Test Reports + +Test reports are generated in the `playwright-report` directory. Each test includes: + +- The prompt text +- A screenshot of the result +- Claude's evaluation +- The test status (passed/failed) + +## Customizing Tests + +You can customize the test framework by: + +1. Adding new prompt collections in `prompt-tests.ts` +2. Modifying the evaluation criteria +3. 
Adjusting timeouts and other parameters in `config.ts` + +## Troubleshooting + +- If tests fail with authentication errors, check your Auth0 credentials +- If Claude evaluation fails, check your Anthropic API key +- If tests time out, you may need to increase the timeout values in the config file \ No newline at end of file diff --git a/quadratic-ai-eval/package.json b/quadratic-ai-eval/package.json new file mode 100644 index 0000000000..ead6cb8b01 --- /dev/null +++ b/quadratic-ai-eval/package.json @@ -0,0 +1,35 @@ +{ + "name": "quadratic-ai-eval", + "version": "1.0.0", + "description": "Playwright test suite for Quadratic with Auth0 authentication", + "main": "index.js", + "scripts": { + "test": "playwright test", + "test:ui": "playwright test --ui", + "test:headed": "playwright test --headed", + "test:debug": "playwright test --debug", + "test:dev": "playwright test --headed --debug-brk --timeout=0", + "test:basic": "playwright test -g 'Testing basic prompt'", + "test:complex": "playwright test -g 'Testing complex data prompt'", + "test:parallel": "playwright test --workers=4", + "test:prompt": "playwright test prompt-evaluation.spec.ts", + "report": "playwright show-report" + }, + "keywords": [ + "quadratic", + "testing", + "playwright", + "automation", + "auth0" + ], + "author": "", + "license": "ISC", + "devDependencies": { + "@playwright/test": "^1.40.0", + "dotenv": "^16.4.7" + }, + "dependencies": { + "@anthropic-ai/sdk": "^0.37.0", + "zod": "^3.24.2" + } +} \ No newline at end of file diff --git a/quadratic-ai-eval/playwright.config.ts b/quadratic-ai-eval/playwright.config.ts new file mode 100644 index 0000000000..1bfd113b81 --- /dev/null +++ b/quadratic-ai-eval/playwright.config.ts @@ -0,0 +1,101 @@ +import { defineConfig, devices } from '@playwright/test'; +import config from './tests/config'; + +/** + * Read environment variables from file. + * https://github.com/motdotla/dotenv + */ +require('dotenv').config(); + +/** + * See https://playwright.dev/docs/test-configuration. + */ +export default defineConfig({ + testDir: './tests', + /* Maximum time one test can run for. */ + timeout: config.timeouts.generation + 60000, // Generation timeout + 1 minute buffer + expect: { + /** + * Maximum time expect() should wait for the condition to be met. + * For example in `await expect(locator).toHaveText();` + */ + timeout: config.timeouts.elementVisibility + }, + /* Run tests in files in parallel */ + fullyParallel: true, // Enable parallel execution + /* Fail the build on CI if you accidentally left test.only in the source code. */ + forbidOnly: !!process.env.CI, + /* Retry on CI only */ + retries: process.env.CI ? 2 : 0, + /* Enable parallel tests with explicit worker count */ + workers: process.env.CI ? 1 : 4, // Use 4 workers locally, 1 in CI + /* Reporter to use. See https://playwright.dev/docs/test-reporters */ + reporter: [ + ['html'], + ['list'] // Add list reporter for better console output + ], + /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */ + use: { + /* Base URL to use in actions like `await page.goto('/')`. */ + baseURL: config.urls.baseUrl, + + /* Collect trace when retrying the failed test. See https://playwright.dev/docs/trace-viewer */ + trace: 'on-first-retry', + + /* Capture screenshot on failure */ + screenshot: 'only-on-failure', + + /* Slow down execution for better visibility during debugging */ + launchOptions: { + slowMo: process.env.DEBUG ? 
100 : 0, + }, + }, + + /* Configure projects for major browsers */ + projects: [ + { + name: 'chromium', + use: { + ...devices['Desktop Chrome'], + viewport: { width: 1920, height: 1080 } + }, + }, + + // { + // name: 'firefox', + // use: { ...devices['Desktop Firefox'] }, + // }, + + // { + // name: 'webkit', + // use: { ...devices['Desktop Safari'] }, + // }, + + /* Test against mobile viewports. */ + // { + // name: 'Mobile Chrome', + // use: { ...devices['Pixel 5'] }, + // }, + // { + // name: 'Mobile Safari', + // use: { ...devices['iPhone 12'] }, + // }, + + /* Test against branded browsers. */ + // { + // name: 'Microsoft Edge', + // use: { ...devices['Desktop Edge'], channel: 'msedge' }, + // }, + // { + // name: 'Google Chrome', + // use: { ...devices['Desktop Chrome'], channel: 'chrome' }, + // }, + ], + + /* Run your local dev server before starting the tests */ + // webServer: { + // command: 'npm run start', + // url: 'http://127.0.0.1:3000', + // reuseExistingServer: !process.env.CI, + // }, +}); \ No newline at end of file diff --git a/quadratic-ai-eval/tests/ai-evaluators.ts b/quadratic-ai-eval/tests/ai-evaluators.ts new file mode 100644 index 0000000000..9ea8889539 --- /dev/null +++ b/quadratic-ai-eval/tests/ai-evaluators.ts @@ -0,0 +1,392 @@ +/** + * AI Evaluation Utilities + * + * This file contains functions for evaluating UI screenshots with different AI models. + * Currently supports Claude and OpenAI models. + */ + +import { Anthropic } from '@anthropic-ai/sdk'; +import * as fs from 'fs'; +import { OpenAI } from 'openai'; +import { z } from 'zod'; +import config from './config'; +import type { PromptTest } from './prompt-tests'; + +// Define the schema for AI evaluation response +export const EvaluationSchema = z.object({ + criteria_evaluations: z.array( + z.object({ + criterion: z.string(), + met: z.enum(['YES', 'PARTIALLY', 'NO']), + explanation: z.string() + }) + ), + rating: z.enum(['GREEN', 'YELLOW', 'RED']), + explanation: z.string().min(1), + confidence: z.enum(['HIGH', 'MEDIUM', 'LOW']).optional() +}); + +export type Evaluation = z.infer; +export type AIProvider = 'claude' | 'openai'; + +// Define the model configuration interface +export interface ModelConfig { + provider: AIProvider; + modelName: string; + maxTokens?: number; + systemPrompt?: string; + evaluationPrompt?: string; +} + +export interface EvaluationResult { + rating: string; + explanation: string; + validationStatus: string; + criteriaEvaluations: any[]; + satisfactionPercentage: string; + confidence: string; +} + +/** + * Simplified evaluation function for screenshots + * + * @param modelConfig The AI model configuration to use + * @param screenshotPath Path to the screenshot + * @param validationCriteriaString A string of validation criteria + * @param promptText The original prompt text that generated the result + * @returns Evaluation result + */ +export async function simpleAiEval( + modelConfig: ModelConfig | AIProvider, + screenshotPath: string, + validationCriteriaString: string, + promptText: string +): Promise { + // Convert screenshot to base64 for sending to AI models + const screenshotBase64 = fs.readFileSync(screenshotPath, { encoding: 'base64' }); + + // If modelConfig is just a provider string, convert it to a ModelConfig + const resolvedConfig = typeof modelConfig === 'string' + ? 
getDefaultModelConfig(modelConfig) + : modelConfig; + + switch (resolvedConfig.provider) { + case 'claude': + return evaluateWithClaude(screenshotBase64, promptText, validationCriteriaString, resolvedConfig); + case 'openai': + return evaluateWithOpenAI(screenshotBase64, promptText, validationCriteriaString, resolvedConfig); + default: + throw new Error(`Unsupported provider: ${resolvedConfig.provider}`); + } +} + +/** + * Main function to evaluate a screenshot with a specified AI model + * + * @param modelConfig The AI model configuration to use + * @param screenshotPath Path to the screenshot file + * @param promptTest The prompt test object containing criteria and prompt + * @returns Evaluation result with rating, explanation, and validation status + */ +export async function aiEval( + modelConfig: ModelConfig | AIProvider, + screenshotPath: string, + promptTest: PromptTest +): Promise { + // Convert screenshot to base64 for sending to AI models + const screenshotBase64 = fs.readFileSync(screenshotPath, { encoding: 'base64' }); + + // Prepare the validation criteria text + const criteriaText = promptTest.validationCriteria + .map((criteria, index) => `${index + 1}. ${criteria}`) + .join('\n'); + + // If modelConfig is just a provider string, convert it to a ModelConfig + const resolvedConfig = typeof modelConfig === 'string' + ? getDefaultModelConfig(modelConfig) + : modelConfig; + + switch (resolvedConfig.provider) { + case 'claude': + return evaluateWithClaude(screenshotBase64, promptTest.prompt, criteriaText, resolvedConfig); + case 'openai': + return evaluateWithOpenAI(screenshotBase64, promptTest.prompt, criteriaText, resolvedConfig); + default: + throw new Error(`Unsupported provider: ${resolvedConfig.provider}`); + } +} + +/** + * Get the default model configuration for a provider + * + * @param provider The AI provider ('claude' or 'openai') + * @returns Model configuration with default values + */ +function getDefaultModelConfig(provider: AIProvider): ModelConfig { + switch (provider) { + case 'claude': { + const claudeModel = config.models.find(model => model.provider === 'anthropic'); + if (!claudeModel) { + throw new Error('No Claude model configuration found'); + } + return { + provider: 'claude', + modelName: claudeModel.id, + maxTokens: claudeModel.maxTokens, + systemPrompt: claudeModel.systemPrompt, + evaluationPrompt: claudeModel.evaluationPrompt + }; + } + case 'openai': { + const openaiModel = config.models.find(model => model.provider === 'openai'); + if (!openaiModel) { + throw new Error('No OpenAI model configuration found'); + } + return { + provider: 'openai', + modelName: openaiModel.id, + maxTokens: openaiModel.maxTokens, + systemPrompt: openaiModel.systemPrompt, + evaluationPrompt: openaiModel.evaluationPrompt + }; + } + default: + throw new Error(`Unsupported provider: ${provider}`); + } +} + +/** + * Evaluate a screenshot with Claude + * + * @param screenshotBase64 Base64-encoded screenshot + * @param promptText The original prompt text + * @param criteriaText Formatted validation criteria + * @param modelConfig The Claude model configuration + * @returns Evaluation result + */ +async function evaluateWithClaude( + screenshotBase64: string, + promptText: string, + criteriaText: string, + modelConfig: ModelConfig +): Promise { + // Check if ANTHROPIC_API_KEY is set + if (!process.env.ANTHROPIC_API_KEY) { + throw new Error('ANTHROPIC_API_KEY environment variable is not set'); + } + + // Initialize Anthropic client + const anthropic = new Anthropic({ + apiKey: 
process.env.ANTHROPIC_API_KEY, + }); + + // Prepare the prompt for Claude using the template from config + const claudeModel = config.models.find(model => model.provider === 'anthropic'); + const claudePrompt = (modelConfig.evaluationPrompt || (claudeModel?.evaluationPrompt || '')) + .replace('{promptText}', promptText) + .replace('{criteriaText}', criteriaText); + + try { + const message = await anthropic.messages.create({ + model: modelConfig.modelName, + max_tokens: modelConfig.maxTokens || (claudeModel?.maxTokens || 1000), + system: modelConfig.systemPrompt || (claudeModel?.systemPrompt || ''), + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: claudePrompt + }, + { + type: "image", + source: { + type: "base64", + media_type: "image/png", + data: screenshotBase64 + } + } + ] + } + ] + }); + + // The content is an array of content blocks, we need to check the type + const contentBlock = message.content[0]; + let evaluationText = ''; + + // Check if the content block is of type 'text' + if (contentBlock.type === 'text') { + evaluationText = contentBlock.text; + } else { + evaluationText = JSON.stringify(contentBlock); + } + + return parseAIResponse(evaluationText); + } catch (error) { + console.error('Error evaluating with Claude:', error); + return { + rating: 'UNKNOWN', + explanation: `Error evaluating with Claude: ${error}`, + validationStatus: 'FAILED', + criteriaEvaluations: [], + satisfactionPercentage: '0%', + confidence: 'LOW' + }; + } +} + +/** + * Evaluate a screenshot with OpenAI + * + * @param screenshotBase64 Base64-encoded screenshot + * @param promptText The original prompt text + * @param criteriaText Formatted validation criteria + * @param modelConfig The OpenAI model configuration + * @returns Evaluation result + */ +async function evaluateWithOpenAI( + screenshotBase64: string, + promptText: string, + criteriaText: string, + modelConfig: ModelConfig +): Promise { + // Check if OPENAI_API_KEY is set + if (!process.env.OPENAI_API_KEY) { + throw new Error('OPENAI_API_KEY environment variable is not set'); + } + + // Initialize OpenAI client + const openai = new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, + }); + + // Prepare the prompt for OpenAI using the template from config + const openaiModel = config.models.find(model => model.provider === 'openai'); + const openaiPrompt = (modelConfig.evaluationPrompt || (openaiModel?.evaluationPrompt || '')) + .replace('{promptText}', promptText) + .replace('{criteriaText}', criteriaText); + + try { + const response = await openai.chat.completions.create({ + model: modelConfig.modelName, + messages: [ + { + role: "system", + content: modelConfig.systemPrompt || (openaiModel?.systemPrompt || '') + }, + { + role: "user", + content: [ + { + type: "text", + text: openaiPrompt + }, + { + type: "image_url", + image_url: { + url: `data:image/png;base64,${screenshotBase64}`, + detail: "high" + } + } + ] + } + ], + max_tokens: modelConfig.maxTokens || (openaiModel?.maxTokens || 1000) + }); + + const evaluationText = response.choices[0]?.message?.content || ''; + return parseAIResponse(evaluationText); + } catch (error) { + console.error('Error evaluating with OpenAI:', error); + return { + rating: 'UNKNOWN', + explanation: `Error evaluating with OpenAI: ${error}`, + validationStatus: 'FAILED', + criteriaEvaluations: [], + satisfactionPercentage: '0%', + confidence: 'LOW' + }; + } +} + +/** + * Parse the AI response text into a structured evaluation result + * + * @param evaluationText The raw text response from 
the AI model + * @returns Evaluation result with validation status + */ +function parseAIResponse(evaluationText: string): EvaluationResult { + try { + // Find JSON in the response text + let jsonText = evaluationText; + const jsonMatch = evaluationText.match(/(\{[\s\S]*\})/); + if (jsonMatch) { + jsonText = jsonMatch[0]; + } + + // Parse the JSON + const parsedJson = JSON.parse(jsonText); + + // For our simplified format, we'll just check if the required fields exist + if (parsedJson.criteria_evaluations && parsedJson.overall_satisfaction && parsedJson.explanation) { + // Return the validated data with a PASSED status + + // Use the overall satisfaction directly from the response + const satisfactionScore = parsedJson.overall_satisfaction; + + // For backwards compatibility, derive a rating from the satisfaction score + let rating = 'RED'; + if (satisfactionScore >= 80) { + rating = 'GREEN'; + } else if (satisfactionScore >= 50) { + rating = 'YELLOW'; + } + + return { + rating: rating, + explanation: parsedJson.explanation, + validationStatus: 'PASSED', + criteriaEvaluations: parsedJson.criteria_evaluations, + satisfactionPercentage: satisfactionScore.toFixed(1) + '%', + confidence: 'HIGH' // We're no longer using confidence levels + }; + } else { + // Try to extract just the satisfaction and explanation + const fallbackSatisfaction = parsedJson.overall_satisfaction || 0; + const fallbackExplanation = parsedJson.explanation || 'No explanation provided'; + + // For backwards compatibility, derive a rating from the satisfaction score + let fallbackRating = 'RED'; + if (fallbackSatisfaction >= 80) { + fallbackRating = 'GREEN'; + } else if (fallbackSatisfaction >= 50) { + fallbackRating = 'YELLOW'; + } + + console.warn('JSON validation failed. Missing required fields.'); + console.warn('Attempting fallback parsing with just satisfaction and explanation'); + + return { + rating: fallbackRating, + explanation: fallbackExplanation, + validationStatus: 'PARTIAL', + criteriaEvaluations: [], + satisfactionPercentage: fallbackSatisfaction.toFixed(1) + '%', + confidence: 'LOW' + }; + } + } catch (error) { + // Handle JSON parsing errors + console.error('Error parsing AI response:', error); + return { + rating: 'UNKNOWN', + explanation: `Error parsing AI response: ${error}. Received: ${evaluationText}`, + validationStatus: 'FAILED', + criteriaEvaluations: [], + satisfactionPercentage: '0%', + confidence: 'LOW' + }; + } +} \ No newline at end of file diff --git a/quadratic-ai-eval/tests/config.ts b/quadratic-ai-eval/tests/config.ts new file mode 100644 index 0000000000..8811e28408 --- /dev/null +++ b/quadratic-ai-eval/tests/config.ts @@ -0,0 +1,119 @@ +/** + * Configuration for the Quadratic AI Evaluation tests + */ + +export const config = { + // Authentication + auth: { + email: process.env.AUTH_EMAIL || '', + password: process.env.AUTH_PASSWORD || '', + loginUrl: 'https://qa.quadratic-preview.com/', + redirectUrl: /teams/ + }, + + // Timeouts (in milliseconds) + timeouts: { + navigation: 30000, + elementVisibility: 15000, + generation: 300000 // 5 minutes max for AI generation + }, + + // Models configuration - enable as many as needed + models: [ + { + provider: 'anthropic', + id: 'claude-3-7-sonnet-20250219', + maxTokens: 1000, + temperature: 0.2, + systemPrompt: `You are an expert evaluator of data visualizations with extensive experience in spreadsheet analysis. +Your task is to objectively evaluate if a spreadsheet visualization meets the requirements of a prompt. 
+Always respond with valid JSON in the exact format requested, without any preamble, explanations outside the JSON, or markdown formatting. +Focus exclusively on what is visible in the image provided, not what you think should be there.`, + evaluationPrompt: ` +I'm showing you a screenshot of a Quadratic spreadsheet that was generated from the prompt: "{promptText}". + +Carefully analyze the image and evaluate if the result correctly implements what was requested in the prompt. + +Specific criteria to evaluate: +{criteriaText} + +For each criterion, provide a satisfaction score from 0-100, where: +- 0: The criterion is not met at all +- 50: The criterion is partially met +- 100: The criterion is fully met + +Provide your evaluation in this exact JSON format: +{ + "criteria_evaluations": [ + {"criterion": "Criterion 1", "satisfaction_score": 85, "explanation": "Brief explanation"}, + {"criterion": "Criterion 2", "satisfaction_score": 50, "explanation": "Brief explanation"}, + ... + ], + "overall_satisfaction": 75, + "explanation": "Your detailed explanation summarizing your evaluation" +} + +Your response must be valid JSON that can be parsed programmatically. +` + }, + { + provider: 'openai', + id: 'gpt-4o', + maxTokens: 1000, + temperature: 0.2, + systemPrompt: `You are an expert evaluator of data visualizations with extensive experience in spreadsheet analysis. +Your task is to objectively evaluate if a spreadsheet visualization meets the requirements of a prompt. +Always respond with valid JSON in the exact format requested, without any preamble, explanations outside the JSON, or markdown formatting. +Focus exclusively on what is visible in the image provided, not what you think should be there.`, + evaluationPrompt: ` +I'm showing you a screenshot of a Quadratic spreadsheet that was generated from the prompt: "{promptText}". + +Carefully analyze the image and evaluate if the result correctly implements what was requested in the prompt. + +Specific criteria to evaluate: +{criteriaText} + +For each criterion, provide a satisfaction score from 0-100, where: +- 0: The criterion is not met at all +- 50: The criterion is partially met +- 100: The criterion is fully met + +Provide your evaluation in this exact JSON format: +{ + "criteria_evaluations": [ + {"criterion": "Criterion 1", "satisfaction_score": 85, "explanation": "Brief explanation"}, + {"criterion": "Criterion 2", "satisfaction_score": 50, "explanation": "Brief explanation"}, + ... + ], + "overall_satisfaction": 75, + "explanation": "Your detailed explanation summarizing your evaluation" +} + +Your response must be valid JSON that can be parsed programmatically. 
+` + } + ], + + // Test execution + execution: { + // Set to true to run tests in parallel, false to run sequentially + parallel: true, + // Maximum number of parallel tests (if parallel is true) + maxWorkers: 4, + // Simple scoring configuration + scoring: { + // Enable scoring + enabled: true, + // Threshold for passing (0.0 to 100.0) - test passes if average satisfaction score >= threshold + passThreshold: 80 + } + }, + + // URLs + urls: { + baseUrl: 'https://qa.quadratic-preview.com', + createFileWithPrompt: '/files/create?prompt=' + } +}; + +export default config; \ No newline at end of file diff --git a/quadratic-ai-eval/tests/evaluation-scoring.ts b/quadratic-ai-eval/tests/evaluation-scoring.ts new file mode 100644 index 0000000000..fd593d2ff3 --- /dev/null +++ b/quadratic-ai-eval/tests/evaluation-scoring.ts @@ -0,0 +1,49 @@ +/** + * Evaluation Scoring Utilities + * + * This file contains functions for calculating aggregate scores based on AI model evaluations. + */ + +import type { EvaluationResult } from './ai-evaluators'; +import config from './config'; + +/** + * Calculates an aggregate satisfaction score from multiple model evaluations + * + * @param evaluations Array of evaluation results from different models + * @returns A score between 0.0 and 100.0 representing overall satisfaction + */ +export function calculateAggregateScore(evaluations: EvaluationResult[]): number { + if (!evaluations.length) return 0; + + // Extract the overall satisfaction scores from each evaluation + const satisfactionScores = evaluations.map(evaluation => { + // Parse the satisfaction percentage from string to number + const matchResult = evaluation.satisfactionPercentage?.match(/(\d+(\.\d+)?)/); + return matchResult ? parseFloat(matchResult[0]) : 0; + }); + + // Calculate the average satisfaction score + const totalScore = satisfactionScores.reduce((sum, score) => sum + score, 0); + return totalScore / satisfactionScores.length; +} + +/** + * Determines if the test passes based on the aggregate score + * + * @param aggregateScore The calculated aggregate score + * @returns Boolean indicating if the test passes + */ +export function doesPassThreshold(aggregateScore: number): boolean { + return aggregateScore >= config.execution.scoring.passThreshold; +} + +/** + * Formats the aggregate score as a percentage string + * + * @param aggregateScore The calculated aggregate score + * @returns Formatted percentage string (e.g. 
"95.0%") + */ +export function formatScorePercentage(aggregateScore: number): string { + return aggregateScore.toFixed(1) + '%'; +} \ No newline at end of file diff --git a/quadratic-ai-eval/tests/prompt-evaluation.spec.ts b/quadratic-ai-eval/tests/prompt-evaluation.spec.ts new file mode 100644 index 0000000000..cb11281c72 --- /dev/null +++ b/quadratic-ai-eval/tests/prompt-evaluation.spec.ts @@ -0,0 +1,280 @@ +import { expect, test } from '@playwright/test'; +import * as fs from 'fs'; +import * as path from 'path'; +import { aiEval, EvaluationResult, ModelConfig } from './ai-evaluators'; +import config from './config'; +import { calculateAggregateScore, doesPassThreshold, formatScorePercentage } from './evaluation-scoring'; +import { testPrompts } from './prompt-tests'; + +// Helper function to handle login +async function login(page: any) { + // Navigate to the homepage + await page.goto(config.auth.loginUrl); + + // Wait for Auth0 login page to load + await page.waitForSelector('input[name="username"]', { timeout: config.timeouts.navigation }); + + // Fill in Auth0 login form + const emailInput = page.locator('input[name="username"]'); + const passwordInput = page.locator('input[name="password"]'); + + await expect(emailInput).toBeVisible({ timeout: config.timeouts.elementVisibility }); + await emailInput.fill(config.auth.email); + + await expect(passwordInput).toBeVisible({ timeout: config.timeouts.elementVisibility }); + await passwordInput.fill(config.auth.password); + + // press enter + await page.keyboard.press('Enter'); + + // Wait for redirect back to Quadratic after successful login + await page.waitForURL(config.auth.redirectUrl, { timeout: config.timeouts.navigation }); +} + +// Helper function to format criteria evaluations for display +function formatCriteriaEvaluations(criteriaEvaluations) { + if (!criteriaEvaluations || criteriaEvaluations.length === 0) { + return 'No criteria evaluations available'; + } + + return criteriaEvaluations.map((ce, idx) => + `${idx + 1}. ${ce.criterion}: ${ce.met} - ${ce.explanation}` + ).join('\n'); +} + +// Run a single consensus test for each prompt +test.describe('Quadratic AI Prompt Tests', () => { + for (const promptTest of testPrompts) { + test(promptTest.name, async ({ page }) => { + // Check if we have at least one API key + const hasClaudeKey = !!process.env.ANTHROPIC_API_KEY; + const hasOpenAIKey = !!process.env.OPENAI_API_KEY; + + if (!hasClaudeKey && !hasOpenAIKey) { + test.skip(true, 'Neither ANTHROPIC_API_KEY nor OPENAI_API_KEY environment variables are set. 
Skipping evaluation.'); + return; + } + + // Collect the model configs we can use based on available API keys + const modelConfigs: ModelConfig[] = []; + + if (hasClaudeKey) { + const claudeModel = config.models.find(model => model.provider === 'anthropic'); + if (claudeModel) { + modelConfigs.push({ + provider: 'claude', + modelName: claudeModel.id, + maxTokens: claudeModel.maxTokens + }); + } + } + + if (hasOpenAIKey) { + const openaiModel = config.models.find(model => model.provider === 'openai'); + if (openaiModel) { + modelConfigs.push({ + provider: 'openai', + modelName: openaiModel.id, + maxTokens: openaiModel.maxTokens + }); + } + } + + // Navigate and wait for the result to be generated + await login(page); + await page.goto(config.urls.createFileWithPrompt + encodeURIComponent(promptTest.prompt)); + + // Wait for the cancel button to disappear (if it was present) + await page.waitForSelector('button:has-text("Cancel generating")', { timeout: config.timeouts.elementVisibility }); + await page.waitForSelector('button:has-text("Cancel generating")', { state: 'hidden', timeout: config.timeouts.generation }); + + // Take a screenshot for AI evaluation + const testResultsDir = path.join(__dirname, '../test-results'); + if (!fs.existsSync(testResultsDir)) { + fs.mkdirSync(testResultsDir, { recursive: true }); + } + + const timestamp = Date.now(); + const screenshotPath = path.join(testResultsDir, `${promptTest.name.replace(/\s+/g, '-')}-${timestamp}.png`); + + const canvasElement = page.locator('#QuadraticCanvasID'); + await expect(canvasElement).toBeVisible({ timeout: config.timeouts.elementVisibility }); + await canvasElement.screenshot({ path: screenshotPath }); + + // Attach the screenshot to the test report + await test.info().attach(`${promptTest.name}-result.png`, { + path: screenshotPath, + contentType: 'image/png' + }); + + // If we only have one model, just use that for evaluation + if (modelConfigs.length === 1) { + const modelConfig = modelConfigs[0]; + const result = await aiEval(modelConfig, screenshotPath, promptTest); + + // Get model info for reporting + const providerName = modelConfig.provider; + const modelName = modelConfig.modelName; + + // Format the evaluation result for test output + const evaluationSummary = ` +=== SINGLE MODEL EVALUATION RESULTS === +Test: ${promptTest.name} +Prompt: "${promptTest.prompt}" +Provider: ${providerName} +Model: ${modelName} +Rating: ${result.rating} +Validation: ${result.validationStatus} +Confidence: ${result.confidence} +Satisfaction: ${result.satisfactionPercentage} +--- +${result.explanation} +--- +${formatCriteriaEvaluations(result.criteriaEvaluations)} +=========================== +`; + + // Add the evaluation to the test report + test.info().annotations.push({ + type: 'AI Evaluation', + description: evaluationSummary + }); + + // Assertions based on the rating and validation status + expect(result.validationStatus, 'AI response validation failed').toBe('PASSED'); + + if (result.validationStatus === 'PASSED' && promptTest.expectedRating) { + // Only assert on rating if validation passed and expected rating is specified + expect(result.rating, `AI evaluation indicates issues with the result for "${promptTest.name}"`).toBe(promptTest.expectedRating); + } + + return; // End test early if only one model + } + + // With multiple models, run consensus evaluation + // Collect evaluations from each model + const evaluations: EvaluationResult[] = []; + for (const modelConfig of modelConfigs) { + try { + const result = await 
aiEval(modelConfig, screenshotPath, promptTest); + + // Add individual model results to the test report + const individualSummary = ` +=== ${modelConfig.provider.toUpperCase()} EVALUATION === +Rating: ${result.rating} +Confidence: ${result.confidence} +Satisfaction: ${result.satisfactionPercentage} +--- +${formatCriteriaEvaluations(result.criteriaEvaluations)} +`; + + test.info().annotations.push({ + type: `${modelConfig.provider.charAt(0).toUpperCase() + modelConfig.provider.slice(1)} Evaluation`, + description: individualSummary + }); + + evaluations.push(result); + } catch (error) { + console.error(`Error evaluating with ${modelConfig.provider}:`, error); + } + } + + // Skip if we didn't get at least 2 successful evaluations when we have 2+ models + if (evaluations.length < 2 && modelConfigs.length >= 2) { + test.skip(true, `Not enough successful evaluations for consensus (got ${evaluations.length}, need at least 2)`); + return; + } + + // Collect ratings and confidences + const ratings = evaluations.map(e => e.rating); + const confidences = evaluations.map(e => e.confidence); + + // Check for consensus with confidence weighting if enabled + let majorityRating = ''; + let maxWeight = 0; + + // Simple counting without weighting + const counts: Record = {}; + ratings.forEach(rating => { + counts[rating] = (counts[rating] || 0) + 1; + }); + + // Find the majority rating + let maxCount = 0; + Object.entries(counts).forEach(([rating, count]) => { + if (count > maxCount) { + majorityRating = rating; + maxCount = count; + } + }); + + // Calculate agreement percentage + var agreementPercentage = maxCount / ratings.length; + + // Create consensus result + const consensusResult = { + rating: majorityRating, + agreementPercentage: (agreementPercentage * 100).toFixed(1) + '%', + ratings: ratings.join(', '), + confidences: confidences.join(', '), + models: modelConfigs.map(mc => `${mc.provider}:${mc.modelName}`).join(', '), + expectedRating: promptTest.expectedRating || 'Not specified' + }; + + // Calculate aggregate satisfaction score if enabled + let aggregateScore = 0; + let passesThreshold = false; + let scorePercentage = '0.0%'; + + if (config.execution.scoring?.enabled) { + aggregateScore = calculateAggregateScore(evaluations); + passesThreshold = doesPassThreshold(aggregateScore); + scorePercentage = formatScorePercentage(aggregateScore); + + // Add score information to consensus result + Object.assign(consensusResult, { + aggregateScore, + scorePercentage, + passesThreshold + }); + } + + // Format consensus result for test output + const consensusSummary = ` +=== CONSENSUS EVALUATION RESULTS === +Test: ${promptTest.name} +Prompt: "${promptTest.prompt}" +Models used: ${consensusResult.models} +Individual ratings: ${consensusResult.ratings} +Individual confidences: ${consensusResult.confidences} +Majority rating: ${consensusResult.rating} +${config.execution.scoring?.enabled ? `Aggregate satisfaction score: ${scorePercentage} +Passes threshold (${config.execution.scoring.passThreshold}%): ${passesThreshold ? 
'YES' : 'NO'}` : ''} +Expected rating: ${consensusResult.expectedRating} +=========================== +`; + + // Add the consensus to the test report + test.info().annotations.push({ + type: 'Consensus Evaluation', + description: consensusSummary + }); + + // Use the first evaluation's detailed explanation + const detailedExplanation = evaluations[0].explanation; + test.info().annotations.push({ + type: 'Detailed Explanation', + description: detailedExplanation + }); + + // Assertions when we have multiple models + if (modelConfigs.length >= 2) { + // Only check aggregate score threshold + expect( + passesThreshold, + `Aggregate satisfaction score (${scorePercentage}) is below threshold (${config.execution.scoring.passThreshold}%)` + ).toBe(true); + } + }); + } +}); \ No newline at end of file diff --git a/quadratic-ai-eval/tests/prompt-tests.ts b/quadratic-ai-eval/tests/prompt-tests.ts new file mode 100644 index 0000000000..52a4f6690a --- /dev/null +++ b/quadratic-ai-eval/tests/prompt-tests.ts @@ -0,0 +1,89 @@ +// Define the schema for test prompts +export interface PromptTest { + prompt: string; + name: string; + validationCriteria: string[]; + expectedRating?: 'GREEN' | 'YELLOW' | 'RED'; +} + +// Define the test prompts and their validation criteria +export const testPrompts: PromptTest[] = [ + { + name: 'States GDP Map', + prompt: 'Return a table of states with gdp per capita, and plot it on a map, use state abbreviations for the map', + validationCriteria: [ + 'Does the spreadsheet contain a table of states with GDP per capita?', + 'Is there a map visualization of this data?', + 'Is the data properly formatted and presented?', + 'Are there any obvious errors or issues?', + 'Are the states on the map correctly colored?' + ], + expectedRating: 'GREEN' + }, + { + name: 'First 500 Prime Numbers', + prompt: 'return the first 500 prime numbers in a list', + validationCriteria: [ + 'Does the spreadsheet contain a table of the first 500 prime numbers? It\'s ok if it\'s cut off and all the numbers are not visible.', + ], + expectedRating: 'GREEN' + }, + { + name: 'Stock Price Trends', + prompt: 'Show me the stock price trends for AAPL, MSFT, GOOG, and AMZN over the past 5 years with a line chart.', + validationCriteria: [ + 'Does the spreadsheet contain stock price data for AAPL, MSFT, GOOG, and AMZN?', + 'Is there a line chart visualization showing the trends?', + 'Does the chart cover approximately a 5-year period?', + 'Are the lines properly labeled or is there a legend?', + 'Is the data properly formatted and presented?' + ], + expectedRating: 'GREEN' + }, + { + name: 'COVID-19 Cases by Country', + prompt: 'Create a bar chart showing COVID-19 cases by country for the top 10 most affected countries.', + validationCriteria: [ + 'Does the spreadsheet contain COVID-19 case data for countries?', + 'Is there a bar chart visualization showing the top 10 countries?', + 'Are the countries sorted by number of cases?', + 'Are the bars properly labeled?', + 'Is the data properly formatted and presented?' + ], + expectedRating: 'GREEN' + }, + { + name: 'Basic Chart Creation', + prompt: 'Insert a line chart with the x axis as dates and y axis as number of sales, first generate sample data for that chart', + validationCriteria: [ + 'Does the spreadsheet contain a dataset of a time series of date and sales values?', + 'Is there a line chart visualization showcasing the sales over time?', + 'Is the data properly formatted and presented?', + 'Are there any obvious errors or issues?' 
+ ], + expectedRating: 'GREEN' + }, + { + name: 'DataFrame Manipulation', + prompt: 'Insert the following dataset and then use python to reference the data and make it cleaner/more readable, then display the cleaned dataset\nProduct Name\tPrice\tIn Stock\tDate Added\tRating\nlaptop pro 15"\t1299.99\tYes\t2023-01-15\t4.7/5\nSMARTPHONE X\t899\tNO\t2023-01-22\t3.9 stars\nwireless headphones\t129.95\tyes\t2023-02-05\t4.2/5\n49.99\tYes\t2023-01-30\t4.0 stars\nGAMING MOUSE\t79.99\t\t2023-02-15\t4.5/5\nexternal SSD 1TB\t159.99\tYES\t2023-03-01\t\nmechanical keyboard\t149.95\tno\t2023-03-10\t4.8 stars\nmonitor 27"\t249.99\tYes\t2023-02-20\t4.3/5', + validationCriteria: [ + 'Does the spreadsheet show the original dataset?', + 'Is there a cleaned version of the dataset displayed?', + 'Is the cleaned data more legible and easier to work with?', + 'Are there any obvious errors or issues?' + ], + expectedRating: 'GREEN' + }, + { + name: 'DataFrame Star Rating Count', + prompt: 'Insert the following dataset and then count the number of items that have 4 stars or greater; display the answer as "Number: " and the answer\nProduct Name\tPrice\tIn Stock\tDate Added\tRating\nlaptop pro 15"\t1299.99\tYes\t2023-01-15\t4.7/5\nSMARTPHONE X\t899\tNO\t2023-01-22\t3.9 stars\nwireless headphones\t129.95\tyes\t2023-02-05\t4.2/5\n49.99\tYes\t2023-01-30\t4.0 stars\nGAMING MOUSE\t79.99\t\t2023-02-15\t4.5/5\nexternal SSD 1TB\t159.99\tYES\t2023-03-01\t\nmechanical keyboard\t149.95\tno\t2023-03-10\t4.8 stars\nmonitor 27"\t249.99\tYes\t2023-02-20\t4.3/5', + validationCriteria: [ + 'Does the spreadsheet show the dataset?', + 'Is the correct count of items with 4 stars or greater displayed as "Number: 6"?', + 'Is the data properly formatted and presented?', + 'Are there any obvious errors or issues?' + ], + expectedRating: 'GREEN' + }, + // Add more test prompts as needed +];
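
To make the moving parts above concrete, here is a minimal sketch of a new prompt entry plus a standalone re-scoring script built on the helpers this diff introduces (`aiEval`, `calculateAggregateScore`, `doesPassThreshold`, `formatScorePercentage`). It assumes the file lives in `quadratic-ai-eval/tests/` so the relative imports resolve, and that it is run with a TypeScript runner such as `ts-node`. The prompt name, text, criteria, and screenshot path are illustrative placeholders, not part of this diff. Note also that `ai-evaluators.ts` imports the `openai` package at module load, so that package would need to be installed even for Claude-only runs; it is not currently declared in `quadratic-ai-eval/package.json`.

```typescript
import 'dotenv/config'; // ai-evaluators.ts reads keys from process.env but does not load .env itself

import { aiEval, EvaluationResult } from './ai-evaluators';
import config from './config';
import { calculateAggregateScore, doesPassThreshold, formatScorePercentage } from './evaluation-scoring';
import type { PromptTest } from './prompt-tests';

// Hypothetical prompt entry; appending an object like this to `testPrompts` in
// tests/prompt-tests.ts is all that is needed for prompt-evaluation.spec.ts to pick it up.
const monthlyRevenueChart: PromptTest = {
  name: 'Monthly Revenue Bar Chart',
  prompt: 'Generate sample monthly revenue data for 2023 and plot it as a bar chart',
  validationCriteria: [
    'Does the spreadsheet contain monthly revenue data for 2023?',
    'Is there a bar chart visualizing revenue by month?',
    'Is the data properly formatted and presented?'
  ],
  expectedRating: 'GREEN'
};

// Re-score an existing screenshot outside of a Playwright run, then aggregate the
// per-model satisfaction scores the same way prompt-evaluation.spec.ts does.
async function rescore(screenshotPath: string): Promise<void> {
  const evaluations: EvaluationResult[] = [];

  // Passing a bare provider string makes aiEval fall back to the defaults in tests/config.ts.
  if (process.env.ANTHROPIC_API_KEY) {
    evaluations.push(await aiEval('claude', screenshotPath, monthlyRevenueChart));
  }
  if (process.env.OPENAI_API_KEY) {
    evaluations.push(await aiEval('openai', screenshotPath, monthlyRevenueChart));
  }

  const aggregate = calculateAggregateScore(evaluations);
  console.log(`Aggregate satisfaction: ${formatScorePercentage(aggregate)}`);
  console.log(
    `Threshold ${config.execution.scoring.passThreshold}%: ${doesPassThreshold(aggregate) ? 'PASS' : 'FAIL'}`
  );
}

// The screenshot path is a placeholder for a file produced by an earlier test run.
rescore('test-results/Monthly-Revenue-Bar-Chart-example.png').catch(console.error);
```

Because `aiEval` accepts either a full `ModelConfig` or a bare provider string, the sketch relies on the model defaults defined in `tests/config.ts`; pass an explicit `ModelConfig` instead to pin a different model name or token budget.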