diff --git a/README.md b/README.md index 9b732fed84..8cf36993aa 100644 --- a/README.md +++ b/README.md @@ -26,3 +26,70 @@ Want to contribute? Read our [Contributing Guide](./CONTRIBUTING.md) ## Quadratic is hiring Check out our open roles ⟶ [careers.quadratichq.com](https://careers.quadratichq.com) + +# Quadratic QA Test Suite + +This repository contains automated tests for the Quadratic application using Playwright. + +## Prerequisites + +- Node.js (v14 or newer) +- npm + +## Setup + +1. Install dependencies: + ``` + npm install + ``` + +2. Install Playwright browsers: + ``` + npx playwright install + ``` + +## Running Tests + +Run all tests: +``` +npm test +``` + +Run tests with UI mode (for debugging and development): +``` +npm run test:ui +``` + +Run tests in headed mode (with visible browser): +``` +npm run test:headed +``` + +Run tests in debug mode: +``` +npm run test:debug +``` + +View the HTML report after a test run: +``` +npm run report +``` + +## Test Structure + +- `tests/homepage.spec.ts` - Basic tests for the homepage +- `tests/login.spec.ts` - Tests for login functionality +- `tests/spreadsheet.spec.ts` - Tests for spreadsheet functionality + +## Notes + +- These tests are designed for the Quadratic application at `app.quadratichq.com` +- The selectors in the tests may need to be updated based on the actual UI +- Screenshots are saved in the project root directory + +## Adding New Tests + +1. Create a new file in the `tests` directory with a `.spec.ts` extension +2. Import the necessary Playwright modules +3. Write your tests using the Playwright API +4. Run the tests to verify they work as expected diff --git a/package-lock.json b/package-lock.json index dab2d3e06c..a5e288b012 100644 --- a/package-lock.json +++ b/package-lock.json @@ -29,6 +29,7 @@ "chalk": "^5.3.0", "commander": "^11.1.0", "concurrently": "^6.5.1", + "dotenv": "^16.4.7", "eslint": "^8.57.0", "jest": "^29.6.1", "kill-port": "^2.0.1", @@ -19908,7 +19909,9 @@ } }, "node_modules/dotenv": { - "version": "16.4.4", + "version": "16.4.7", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.7.tgz", + "integrity": "sha512-47qPchRCykZC03FhkYAhrvwU4xDBFIj1QPqaarj6mdM/hgUzfPHcpkHJOn3mJAufFeeAxAzeGsr5X0M4k6fLZQ==", "license": "BSD-2-Clause", "engines": { "node": ">=12" diff --git a/package.json b/package.json index c33d674f5d..81d162611b 100644 --- a/package.json +++ b/package.json @@ -84,6 +84,7 @@ "chalk": "^5.3.0", "commander": "^11.1.0", "concurrently": "^6.5.1", + "dotenv": "^16.4.7", "eslint": "^8.57.0", "jest": "^29.6.1", "kill-port": "^2.0.1", diff --git a/quadratic-ai-eval/.gitignore b/quadratic-ai-eval/.gitignore new file mode 100644 index 0000000000..7897af19c6 --- /dev/null +++ b/quadratic-ai-eval/.gitignore @@ -0,0 +1,50 @@ +# Playwright specific +/test-results/ +/playwright-report/ +/blob-report/ +/playwright/.cache/ +/html-report/ +/allure-results/ +/allure-report/ + +# Node.js dependencies +/node_modules/ +/package-lock.json +/yarn.lock +/pnpm-lock.yaml + +# Environment variables +.env +.env.local +.env.development +.env.test +.env.production + +# IDE specific files +.idea/ +.vscode/ +*.code-workspace +.DS_Store + +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# Coverage directory +/coverage/ + +# Temporary files +/tmp/ +/temp/ + +# Screenshots and videos from test runs +/screenshots/ +/videos/ + +# User-specific files +*.user +*.suo +*.userprefs \ No newline at end of file diff --git a/quadratic-ai-eval/README.md b/quadratic-ai-eval/README.md new file mode 100644 
index 0000000000..576104e533 --- /dev/null +++ b/quadratic-ai-eval/README.md @@ -0,0 +1,121 @@ +# Quadratic AI Evaluation Framework + +This framework allows you to test Quadratic's AI capabilities by running multiple prompts in parallel and evaluating the results using Claude. + +## Setup + +1. Install dependencies: +```bash +npm install +``` + +2. Set up environment variables: +```bash +# Create a .env file with your Anthropic API key +echo "ANTHROPIC_API_KEY=your_api_key_here" > .env +``` + +## Running Tests + +To run all tests: +```bash +npx playwright test +``` + +To run the prompt evaluation tests specifically: +```bash +npx playwright test prompt-evaluation.spec.ts +# or use the npm script +npm run test:prompt +``` + +To run tests with a UI: +```bash +npx playwright test --ui +``` + +## Test Categories + +The framework includes different categories of tests: + +- **Basic prompts**: Simple data visualization prompts (run with `npm run test:basic`) +- **Complex data prompts**: More complex data analysis prompts (run with `npm run test:complex`) +- **Parallel execution**: Run all tests in parallel (run with `npm run test:parallel`) + +## Adding New Tests + +To add new test prompts, edit the `tests/prompt-tests.ts` file and add your prompts to the appropriate array: + +```typescript +// For basic data visualization prompts +export const testPrompts: PromptTest[] = [ + { + name: 'Your Test Name', + prompt: 'Your prompt text here', + validationCriteria: [ + 'Criterion 1?', + 'Criterion 2?', + 'Criterion 3?', + 'Criterion 4?', + 'Criterion 5?' + ], + expectedRating: 'GREEN' // or 'YELLOW' or 'RED' + }, + // Add more test prompts here +]; + +// For more complex data analysis prompts +export const complexDataPrompts: PromptTest[] = [ + // Add your complex data prompts here +]; +``` + +### Test Structure + +Each test prompt consists of: + +- `name`: A descriptive name for the test +- `prompt`: The actual prompt to send to Quadratic +- `validationCriteria`: An array of questions that Claude will use to evaluate the result +- `expectedRating`: The expected rating from Claude (GREEN, YELLOW, or RED) + +## How It Works + +1. The test framework logs in to Quadratic using Auth0 credentials +2. For each prompt in the test arrays: + - It navigates to the file creation page with the prompt + - Waits for the spreadsheet to be generated + - Takes a screenshot of the result + - Sends the screenshot to Claude for evaluation + - Validates that the result meets the expected criteria + +## Evaluation Criteria + +Claude evaluates each result and provides a rating: + +- **GREEN**: The result looks correct and fully satisfies the prompt requirements +- **YELLOW**: The result partially satisfies the prompt but has minor issues +- **RED**: The result is incorrect or has major issues + +## Test Reports + +Test reports are generated in the `playwright-report` directory. Each test includes: + +- The prompt text +- A screenshot of the result +- Claude's evaluation +- The test status (passed/failed) + +## Customizing Tests + +You can customize the test framework by: + +1. Adding new prompt collections in `prompt-tests.ts` +2. Modifying the evaluation criteria +3. 
Adjusting timeouts and other parameters in `config.ts` + +## Troubleshooting + +- If tests fail with authentication errors, check your Auth0 credentials +- If Claude evaluation fails, check your Anthropic API key +- If tests time out, you may need to increase the timeout values in the config file \ No newline at end of file diff --git a/quadratic-ai-eval/package.json b/quadratic-ai-eval/package.json new file mode 100644 index 0000000000..ead6cb8b01 --- /dev/null +++ b/quadratic-ai-eval/package.json @@ -0,0 +1,35 @@ +{ + "name": "quadratic-ai-eval", + "version": "1.0.0", + "description": "Playwright test suite for Quadratic with Auth0 authentication", + "main": "index.js", + "scripts": { + "test": "playwright test", + "test:ui": "playwright test --ui", + "test:headed": "playwright test --headed", + "test:debug": "playwright test --debug", + "test:dev": "playwright test --headed --debug-brk --timeout=0", + "test:basic": "playwright test -g 'Testing basic prompt'", + "test:complex": "playwright test -g 'Testing complex data prompt'", + "test:parallel": "playwright test --workers=4", + "test:prompt": "playwright test prompt-evaluation.spec.ts", + "report": "playwright show-report" + }, + "keywords": [ + "quadratic", + "testing", + "playwright", + "automation", + "auth0" + ], + "author": "", + "license": "ISC", + "devDependencies": { + "@playwright/test": "^1.40.0", + "dotenv": "^16.4.7" + }, + "dependencies": { + "@anthropic-ai/sdk": "^0.37.0", + "zod": "^3.24.2" + } +} \ No newline at end of file diff --git a/quadratic-ai-eval/playwright.config.ts b/quadratic-ai-eval/playwright.config.ts new file mode 100644 index 0000000000..1bfd113b81 --- /dev/null +++ b/quadratic-ai-eval/playwright.config.ts @@ -0,0 +1,101 @@ +import { defineConfig, devices } from '@playwright/test'; +import config from './tests/config'; + +/** + * Read environment variables from file. + * https://github.com/motdotla/dotenv + */ +require('dotenv').config(); + +/** + * See https://playwright.dev/docs/test-configuration. + */ +export default defineConfig({ + testDir: './tests', + /* Maximum time one test can run for. */ + timeout: config.timeouts.generation + 60000, // Generation timeout + 1 minute buffer + expect: { + /** + * Maximum time expect() should wait for the condition to be met. + * For example in `await expect(locator).toHaveText();` + */ + timeout: config.timeouts.elementVisibility + }, + /* Run tests in files in parallel */ + fullyParallel: true, // Enable parallel execution + /* Fail the build on CI if you accidentally left test.only in the source code. */ + forbidOnly: !!process.env.CI, + /* Retry on CI only */ + retries: process.env.CI ? 2 : 0, + /* Enable parallel tests with explicit worker count */ + workers: process.env.CI ? 1 : 4, // Use 4 workers locally, 1 in CI + /* Reporter to use. See https://playwright.dev/docs/test-reporters */ + reporter: [ + ['html'], + ['list'] // Add list reporter for better console output + ], + /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */ + use: { + /* Base URL to use in actions like `await page.goto('/')`. */ + baseURL: config.urls.baseUrl, + + /* Collect trace when retrying the failed test. See https://playwright.dev/docs/trace-viewer */ + trace: 'on-first-retry', + + /* Capture screenshot on failure */ + screenshot: 'only-on-failure', + + /* Slow down execution for better visibility during debugging */ + launchOptions: { + slowMo: process.env.DEBUG ? 
100 : 0, + }, + }, + + /* Configure projects for major browsers */ + projects: [ + { + name: 'chromium', + use: { + ...devices['Desktop Chrome'], + viewport: { width: 1920, height: 1080 } + }, + }, + + // { + // name: 'firefox', + // use: { ...devices['Desktop Firefox'] }, + // }, + + // { + // name: 'webkit', + // use: { ...devices['Desktop Safari'] }, + // }, + + /* Test against mobile viewports. */ + // { + // name: 'Mobile Chrome', + // use: { ...devices['Pixel 5'] }, + // }, + // { + // name: 'Mobile Safari', + // use: { ...devices['iPhone 12'] }, + // }, + + /* Test against branded browsers. */ + // { + // name: 'Microsoft Edge', + // use: { ...devices['Desktop Edge'], channel: 'msedge' }, + // }, + // { + // name: 'Google Chrome', + // use: { ...devices['Desktop Chrome'], channel: 'chrome' }, + // }, + ], + + /* Run your local dev server before starting the tests */ + // webServer: { + // command: 'npm run start', + // url: 'http://127.0.0.1:3000', + // reuseExistingServer: !process.env.CI, + // }, +}); \ No newline at end of file diff --git a/quadratic-ai-eval/tests/ai-evaluators.ts b/quadratic-ai-eval/tests/ai-evaluators.ts new file mode 100644 index 0000000000..9ea8889539 --- /dev/null +++ b/quadratic-ai-eval/tests/ai-evaluators.ts @@ -0,0 +1,392 @@ +/** + * AI Evaluation Utilities + * + * This file contains functions for evaluating UI screenshots with different AI models. + * Currently supports Claude and OpenAI models. + */ + +import { Anthropic } from '@anthropic-ai/sdk'; +import * as fs from 'fs'; +import { OpenAI } from 'openai'; +import { z } from 'zod'; +import config from './config'; +import type { PromptTest } from './prompt-tests'; + +// Define the schema for AI evaluation response +export const EvaluationSchema = z.object({ + criteria_evaluations: z.array( + z.object({ + criterion: z.string(), + met: z.enum(['YES', 'PARTIALLY', 'NO']), + explanation: z.string() + }) + ), + rating: z.enum(['GREEN', 'YELLOW', 'RED']), + explanation: z.string().min(1), + confidence: z.enum(['HIGH', 'MEDIUM', 'LOW']).optional() +}); + +export type Evaluation = z.infer; +export type AIProvider = 'claude' | 'openai'; + +// Define the model configuration interface +export interface ModelConfig { + provider: AIProvider; + modelName: string; + maxTokens?: number; + systemPrompt?: string; + evaluationPrompt?: string; +} + +export interface EvaluationResult { + rating: string; + explanation: string; + validationStatus: string; + criteriaEvaluations: any[]; + satisfactionPercentage: string; + confidence: string; +} + +/** + * Simplified evaluation function for screenshots + * + * @param modelConfig The AI model configuration to use + * @param screenshotPath Path to the screenshot + * @param validationCriteriaString A string of validation criteria + * @param promptText The original prompt text that generated the result + * @returns Evaluation result + */ +export async function simpleAiEval( + modelConfig: ModelConfig | AIProvider, + screenshotPath: string, + validationCriteriaString: string, + promptText: string +): Promise { + // Convert screenshot to base64 for sending to AI models + const screenshotBase64 = fs.readFileSync(screenshotPath, { encoding: 'base64' }); + + // If modelConfig is just a provider string, convert it to a ModelConfig + const resolvedConfig = typeof modelConfig === 'string' + ? 
getDefaultModelConfig(modelConfig) + : modelConfig; + + switch (resolvedConfig.provider) { + case 'claude': + return evaluateWithClaude(screenshotBase64, promptText, validationCriteriaString, resolvedConfig); + case 'openai': + return evaluateWithOpenAI(screenshotBase64, promptText, validationCriteriaString, resolvedConfig); + default: + throw new Error(`Unsupported provider: ${resolvedConfig.provider}`); + } +} + +/** + * Main function to evaluate a screenshot with a specified AI model + * + * @param modelConfig The AI model configuration to use + * @param screenshotPath Path to the screenshot file + * @param promptTest The prompt test object containing criteria and prompt + * @returns Evaluation result with rating, explanation, and validation status + */ +export async function aiEval( + modelConfig: ModelConfig | AIProvider, + screenshotPath: string, + promptTest: PromptTest +): Promise { + // Convert screenshot to base64 for sending to AI models + const screenshotBase64 = fs.readFileSync(screenshotPath, { encoding: 'base64' }); + + // Prepare the validation criteria text + const criteriaText = promptTest.validationCriteria + .map((criteria, index) => `${index + 1}. ${criteria}`) + .join('\n'); + + // If modelConfig is just a provider string, convert it to a ModelConfig + const resolvedConfig = typeof modelConfig === 'string' + ? getDefaultModelConfig(modelConfig) + : modelConfig; + + switch (resolvedConfig.provider) { + case 'claude': + return evaluateWithClaude(screenshotBase64, promptTest.prompt, criteriaText, resolvedConfig); + case 'openai': + return evaluateWithOpenAI(screenshotBase64, promptTest.prompt, criteriaText, resolvedConfig); + default: + throw new Error(`Unsupported provider: ${resolvedConfig.provider}`); + } +} + +/** + * Get the default model configuration for a provider + * + * @param provider The AI provider ('claude' or 'openai') + * @returns Model configuration with default values + */ +function getDefaultModelConfig(provider: AIProvider): ModelConfig { + switch (provider) { + case 'claude': { + const claudeModel = config.models.find(model => model.provider === 'anthropic'); + if (!claudeModel) { + throw new Error('No Claude model configuration found'); + } + return { + provider: 'claude', + modelName: claudeModel.id, + maxTokens: claudeModel.maxTokens, + systemPrompt: claudeModel.systemPrompt, + evaluationPrompt: claudeModel.evaluationPrompt + }; + } + case 'openai': { + const openaiModel = config.models.find(model => model.provider === 'openai'); + if (!openaiModel) { + throw new Error('No OpenAI model configuration found'); + } + return { + provider: 'openai', + modelName: openaiModel.id, + maxTokens: openaiModel.maxTokens, + systemPrompt: openaiModel.systemPrompt, + evaluationPrompt: openaiModel.evaluationPrompt + }; + } + default: + throw new Error(`Unsupported provider: ${provider}`); + } +} + +/** + * Evaluate a screenshot with Claude + * + * @param screenshotBase64 Base64-encoded screenshot + * @param promptText The original prompt text + * @param criteriaText Formatted validation criteria + * @param modelConfig The Claude model configuration + * @returns Evaluation result + */ +async function evaluateWithClaude( + screenshotBase64: string, + promptText: string, + criteriaText: string, + modelConfig: ModelConfig +): Promise { + // Check if ANTHROPIC_API_KEY is set + if (!process.env.ANTHROPIC_API_KEY) { + throw new Error('ANTHROPIC_API_KEY environment variable is not set'); + } + + // Initialize Anthropic client + const anthropic = new Anthropic({ + apiKey: 
process.env.ANTHROPIC_API_KEY, + }); + + // Prepare the prompt for Claude using the template from config + const claudeModel = config.models.find(model => model.provider === 'anthropic'); + const claudePrompt = (modelConfig.evaluationPrompt || (claudeModel?.evaluationPrompt || '')) + .replace('{promptText}', promptText) + .replace('{criteriaText}', criteriaText); + + try { + const message = await anthropic.messages.create({ + model: modelConfig.modelName, + max_tokens: modelConfig.maxTokens || (claudeModel?.maxTokens || 1000), + system: modelConfig.systemPrompt || (claudeModel?.systemPrompt || ''), + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: claudePrompt + }, + { + type: "image", + source: { + type: "base64", + media_type: "image/png", + data: screenshotBase64 + } + } + ] + } + ] + }); + + // The content is an array of content blocks, we need to check the type + const contentBlock = message.content[0]; + let evaluationText = ''; + + // Check if the content block is of type 'text' + if (contentBlock.type === 'text') { + evaluationText = contentBlock.text; + } else { + evaluationText = JSON.stringify(contentBlock); + } + + return parseAIResponse(evaluationText); + } catch (error) { + console.error('Error evaluating with Claude:', error); + return { + rating: 'UNKNOWN', + explanation: `Error evaluating with Claude: ${error}`, + validationStatus: 'FAILED', + criteriaEvaluations: [], + satisfactionPercentage: '0%', + confidence: 'LOW' + }; + } +} + +/** + * Evaluate a screenshot with OpenAI + * + * @param screenshotBase64 Base64-encoded screenshot + * @param promptText The original prompt text + * @param criteriaText Formatted validation criteria + * @param modelConfig The OpenAI model configuration + * @returns Evaluation result + */ +async function evaluateWithOpenAI( + screenshotBase64: string, + promptText: string, + criteriaText: string, + modelConfig: ModelConfig +): Promise { + // Check if OPENAI_API_KEY is set + if (!process.env.OPENAI_API_KEY) { + throw new Error('OPENAI_API_KEY environment variable is not set'); + } + + // Initialize OpenAI client + const openai = new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, + }); + + // Prepare the prompt for OpenAI using the template from config + const openaiModel = config.models.find(model => model.provider === 'openai'); + const openaiPrompt = (modelConfig.evaluationPrompt || (openaiModel?.evaluationPrompt || '')) + .replace('{promptText}', promptText) + .replace('{criteriaText}', criteriaText); + + try { + const response = await openai.chat.completions.create({ + model: modelConfig.modelName, + messages: [ + { + role: "system", + content: modelConfig.systemPrompt || (openaiModel?.systemPrompt || '') + }, + { + role: "user", + content: [ + { + type: "text", + text: openaiPrompt + }, + { + type: "image_url", + image_url: { + url: `data:image/png;base64,${screenshotBase64}`, + detail: "high" + } + } + ] + } + ], + max_tokens: modelConfig.maxTokens || (openaiModel?.maxTokens || 1000) + }); + + const evaluationText = response.choices[0]?.message?.content || ''; + return parseAIResponse(evaluationText); + } catch (error) { + console.error('Error evaluating with OpenAI:', error); + return { + rating: 'UNKNOWN', + explanation: `Error evaluating with OpenAI: ${error}`, + validationStatus: 'FAILED', + criteriaEvaluations: [], + satisfactionPercentage: '0%', + confidence: 'LOW' + }; + } +} + +/** + * Parse the AI response text into a structured evaluation result + * + * @param evaluationText The raw text response from 
the AI model + * @returns Evaluation result with validation status + */ +function parseAIResponse(evaluationText: string): EvaluationResult { + try { + // Find JSON in the response text + let jsonText = evaluationText; + const jsonMatch = evaluationText.match(/(\{[\s\S]*\})/); + if (jsonMatch) { + jsonText = jsonMatch[0]; + } + + // Parse the JSON + const parsedJson = JSON.parse(jsonText); + + // For our simplified format, we'll just check if the required fields exist + if (parsedJson.criteria_evaluations && parsedJson.overall_satisfaction && parsedJson.explanation) { + // Return the validated data with a PASSED status + + // Use the overall satisfaction directly from the response + const satisfactionScore = parsedJson.overall_satisfaction; + + // For backwards compatibility, derive a rating from the satisfaction score + let rating = 'RED'; + if (satisfactionScore >= 80) { + rating = 'GREEN'; + } else if (satisfactionScore >= 50) { + rating = 'YELLOW'; + } + + return { + rating: rating, + explanation: parsedJson.explanation, + validationStatus: 'PASSED', + criteriaEvaluations: parsedJson.criteria_evaluations, + satisfactionPercentage: satisfactionScore.toFixed(1) + '%', + confidence: 'HIGH' // We're no longer using confidence levels + }; + } else { + // Try to extract just the satisfaction and explanation + const fallbackSatisfaction = parsedJson.overall_satisfaction || 0; + const fallbackExplanation = parsedJson.explanation || 'No explanation provided'; + + // For backwards compatibility, derive a rating from the satisfaction score + let fallbackRating = 'RED'; + if (fallbackSatisfaction >= 80) { + fallbackRating = 'GREEN'; + } else if (fallbackSatisfaction >= 50) { + fallbackRating = 'YELLOW'; + } + + console.warn('JSON validation failed. Missing required fields.'); + console.warn('Attempting fallback parsing with just satisfaction and explanation'); + + return { + rating: fallbackRating, + explanation: fallbackExplanation, + validationStatus: 'PARTIAL', + criteriaEvaluations: [], + satisfactionPercentage: fallbackSatisfaction.toFixed(1) + '%', + confidence: 'LOW' + }; + } + } catch (error) { + // Handle JSON parsing errors + console.error('Error parsing AI response:', error); + return { + rating: 'UNKNOWN', + explanation: `Error parsing AI response: ${error}. Received: ${evaluationText}`, + validationStatus: 'FAILED', + criteriaEvaluations: [], + satisfactionPercentage: '0%', + confidence: 'LOW' + }; + } +} \ No newline at end of file diff --git a/quadratic-ai-eval/tests/config.ts b/quadratic-ai-eval/tests/config.ts new file mode 100644 index 0000000000..8811e28408 --- /dev/null +++ b/quadratic-ai-eval/tests/config.ts @@ -0,0 +1,119 @@ +/** + * Configuration for the Quadratic AI Evaluation tests + */ + +export const config = { + // Authentication + auth: { + email: process.env.AUTH_EMAIL || '', + password: process.env.AUTH_PASSWORD || '', + loginUrl: 'https://qa.quadratic-preview.com/', + redirectUrl: /teams/ + }, + + // Timeouts (in milliseconds) + timeouts: { + navigation: 30000, + elementVisibility: 15000, + generation: 300000 // 5 minutes max for AI generation + }, + + // Models configuration - enable as many as needed + models: [ + { + provider: 'anthropic', + id: 'claude-3-7-sonnet-20250219', + maxTokens: 1000, + temperature: 0.2, + systemPrompt: `You are an expert evaluator of data visualizations with extensive experience in spreadsheet analysis. +Your task is to objectively evaluate if a spreadsheet visualization meets the requirements of a prompt. 
+Always respond with valid JSON in the exact format requested, without any preamble, explanations outside the JSON, or markdown formatting. +Focus exclusively on what is visible in the image provided, not what you think should be there.`, + evaluationPrompt: ` +I'm showing you a screenshot of a Quadratic spreadsheet that was generated from the prompt: "{promptText}". + +Carefully analyze the image and evaluate if the result correctly implements what was requested in the prompt. + +Specific criteria to evaluate: +{criteriaText} + +For each criterion, provide a satisfaction score from 0-100, where: +- 0: The criterion is not met at all +- 50: The criterion is partially met +- 100: The criterion is fully met + +Provide your evaluation in this exact JSON format: +{ + "criteria_evaluations": [ + {"criterion": "Criterion 1", "satisfaction_score": 85, "explanation": "Brief explanation"}, + {"criterion": "Criterion 2", "satisfaction_score": 50, "explanation": "Brief explanation"}, + ... + ], + "overall_satisfaction": 75, + "explanation": "Your detailed explanation summarizing your evaluation" +} + +Your response must be valid JSON that can be parsed programmatically. +` + }, + { + provider: 'openai', + id: 'gpt-4o', + maxTokens: 1000, + temperature: 0.2, + systemPrompt: `You are an expert evaluator of data visualizations with extensive experience in spreadsheet analysis. +Your task is to objectively evaluate if a spreadsheet visualization meets the requirements of a prompt. +Always respond with valid JSON in the exact format requested, without any preamble, explanations outside the JSON, or markdown formatting. +Focus exclusively on what is visible in the image provided, not what you think should be there.`, + evaluationPrompt: ` +I'm showing you a screenshot of a Quadratic spreadsheet that was generated from the prompt: "{promptText}". + +Carefully analyze the image and evaluate if the result correctly implements what was requested in the prompt. + +Specific criteria to evaluate: +{criteriaText} + +For each criterion, provide a satisfaction score from 0-100, where: +- 0: The criterion is not met at all +- 50: The criterion is partially met +- 100: The criterion is fully met + +Provide your evaluation in this exact JSON format: +{ + "criteria_evaluations": [ + {"criterion": "Criterion 1", "satisfaction_score": 85, "explanation": "Brief explanation"}, + {"criterion": "Criterion 2", "satisfaction_score": 50, "explanation": "Brief explanation"}, + ... + ], + "overall_satisfaction": 75, + "explanation": "Your detailed explanation summarizing your evaluation" +} + +Your response must be valid JSON that can be parsed programmatically. 
+` + } + ], + + // Test execution + execution: { + // Set to true to run tests in parallel, false to run sequentially + parallel: true, + // Maximum number of parallel tests (if parallel is true) + maxWorkers: 4, + // Simple scoring configuration + scoring: { + // Enable scoring + enabled: true, + // Threshold for passing (0.0 to 100.0) - test passes if average satisfaction score >= threshold + passThreshold: 80 + } + }, + + // URLs + urls: { + baseUrl: 'https://qa.quadratic-preview.com', + createFileWithPrompt: '/files/create?prompt=' + } +}; + +export default config; \ No newline at end of file diff --git a/quadratic-ai-eval/tests/evaluation-scoring.ts b/quadratic-ai-eval/tests/evaluation-scoring.ts new file mode 100644 index 0000000000..fd593d2ff3 --- /dev/null +++ b/quadratic-ai-eval/tests/evaluation-scoring.ts @@ -0,0 +1,49 @@ +/** + * Evaluation Scoring Utilities + * + * This file contains functions for calculating aggregate scores based on AI model evaluations. + */ + +import type { EvaluationResult } from './ai-evaluators'; +import config from './config'; + +/** + * Calculates an aggregate satisfaction score from multiple model evaluations + * + * @param evaluations Array of evaluation results from different models + * @returns A score between 0.0 and 100.0 representing overall satisfaction + */ +export function calculateAggregateScore(evaluations: EvaluationResult[]): number { + if (!evaluations.length) return 0; + + // Extract the overall satisfaction scores from each evaluation + const satisfactionScores = evaluations.map(evaluation => { + // Parse the satisfaction percentage from string to number + const matchResult = evaluation.satisfactionPercentage?.match(/(\d+(\.\d+)?)/); + return matchResult ? parseFloat(matchResult[0]) : 0; + }); + + // Calculate the average satisfaction score + const totalScore = satisfactionScores.reduce((sum, score) => sum + score, 0); + return totalScore / satisfactionScores.length; +} + +/** + * Determines if the test passes based on the aggregate score + * + * @param aggregateScore The calculated aggregate score + * @returns Boolean indicating if the test passes + */ +export function doesPassThreshold(aggregateScore: number): boolean { + return aggregateScore >= config.execution.scoring.passThreshold; +} + +/** + * Formats the aggregate score as a percentage string + * + * @param aggregateScore The calculated aggregate score + * @returns Formatted percentage string (e.g. 
"95.0%") + */ +export function formatScorePercentage(aggregateScore: number): string { + return aggregateScore.toFixed(1) + '%'; +} \ No newline at end of file diff --git a/quadratic-ai-eval/tests/prompt-evaluation.spec.ts b/quadratic-ai-eval/tests/prompt-evaluation.spec.ts new file mode 100644 index 0000000000..cb11281c72 --- /dev/null +++ b/quadratic-ai-eval/tests/prompt-evaluation.spec.ts @@ -0,0 +1,280 @@ +import { expect, test } from '@playwright/test'; +import * as fs from 'fs'; +import * as path from 'path'; +import { aiEval, EvaluationResult, ModelConfig } from './ai-evaluators'; +import config from './config'; +import { calculateAggregateScore, doesPassThreshold, formatScorePercentage } from './evaluation-scoring'; +import { testPrompts } from './prompt-tests'; + +// Helper function to handle login +async function login(page: any) { + // Navigate to the homepage + await page.goto(config.auth.loginUrl); + + // Wait for Auth0 login page to load + await page.waitForSelector('input[name="username"]', { timeout: config.timeouts.navigation }); + + // Fill in Auth0 login form + const emailInput = page.locator('input[name="username"]'); + const passwordInput = page.locator('input[name="password"]'); + + await expect(emailInput).toBeVisible({ timeout: config.timeouts.elementVisibility }); + await emailInput.fill(config.auth.email); + + await expect(passwordInput).toBeVisible({ timeout: config.timeouts.elementVisibility }); + await passwordInput.fill(config.auth.password); + + // press enter + await page.keyboard.press('Enter'); + + // Wait for redirect back to Quadratic after successful login + await page.waitForURL(config.auth.redirectUrl, { timeout: config.timeouts.navigation }); +} + +// Helper function to format criteria evaluations for display +function formatCriteriaEvaluations(criteriaEvaluations) { + if (!criteriaEvaluations || criteriaEvaluations.length === 0) { + return 'No criteria evaluations available'; + } + + return criteriaEvaluations.map((ce, idx) => + `${idx + 1}. ${ce.criterion}: ${ce.met} - ${ce.explanation}` + ).join('\n'); +} + +// Run a single consensus test for each prompt +test.describe('Quadratic AI Prompt Tests', () => { + for (const promptTest of testPrompts) { + test(promptTest.name, async ({ page }) => { + // Check if we have at least one API key + const hasClaudeKey = !!process.env.ANTHROPIC_API_KEY; + const hasOpenAIKey = !!process.env.OPENAI_API_KEY; + + if (!hasClaudeKey && !hasOpenAIKey) { + test.skip(true, 'Neither ANTHROPIC_API_KEY nor OPENAI_API_KEY environment variables are set. 
Skipping evaluation.'); + return; + } + + // Collect the model configs we can use based on available API keys + const modelConfigs: ModelConfig[] = []; + + if (hasClaudeKey) { + const claudeModel = config.models.find(model => model.provider === 'anthropic'); + if (claudeModel) { + modelConfigs.push({ + provider: 'claude', + modelName: claudeModel.id, + maxTokens: claudeModel.maxTokens + }); + } + } + + if (hasOpenAIKey) { + const openaiModel = config.models.find(model => model.provider === 'openai'); + if (openaiModel) { + modelConfigs.push({ + provider: 'openai', + modelName: openaiModel.id, + maxTokens: openaiModel.maxTokens + }); + } + } + + // Navigate and wait for the result to be generated + await login(page); + await page.goto(config.urls.createFileWithPrompt + encodeURIComponent(promptTest.prompt)); + + // Wait for the cancel button to disappear (if it was present) + await page.waitForSelector('button:has-text("Cancel generating")', { timeout: config.timeouts.elementVisibility }); + await page.waitForSelector('button:has-text("Cancel generating")', { state: 'hidden', timeout: config.timeouts.generation }); + + // Take a screenshot for AI evaluation + const testResultsDir = path.join(__dirname, '../test-results'); + if (!fs.existsSync(testResultsDir)) { + fs.mkdirSync(testResultsDir, { recursive: true }); + } + + const timestamp = Date.now(); + const screenshotPath = path.join(testResultsDir, `${promptTest.name.replace(/\s+/g, '-')}-${timestamp}.png`); + + const canvasElement = page.locator('#QuadraticCanvasID'); + await expect(canvasElement).toBeVisible({ timeout: config.timeouts.elementVisibility }); + await canvasElement.screenshot({ path: screenshotPath }); + + // Attach the screenshot to the test report + await test.info().attach(`${promptTest.name}-result.png`, { + path: screenshotPath, + contentType: 'image/png' + }); + + // If we only have one model, just use that for evaluation + if (modelConfigs.length === 1) { + const modelConfig = modelConfigs[0]; + const result = await aiEval(modelConfig, screenshotPath, promptTest); + + // Get model info for reporting + const providerName = modelConfig.provider; + const modelName = modelConfig.modelName; + + // Format the evaluation result for test output + const evaluationSummary = ` +=== SINGLE MODEL EVALUATION RESULTS === +Test: ${promptTest.name} +Prompt: "${promptTest.prompt}" +Provider: ${providerName} +Model: ${modelName} +Rating: ${result.rating} +Validation: ${result.validationStatus} +Confidence: ${result.confidence} +Satisfaction: ${result.satisfactionPercentage} +--- +${result.explanation} +--- +${formatCriteriaEvaluations(result.criteriaEvaluations)} +=========================== +`; + + // Add the evaluation to the test report + test.info().annotations.push({ + type: 'AI Evaluation', + description: evaluationSummary + }); + + // Assertions based on the rating and validation status + expect(result.validationStatus, 'AI response validation failed').toBe('PASSED'); + + if (result.validationStatus === 'PASSED' && promptTest.expectedRating) { + // Only assert on rating if validation passed and expected rating is specified + expect(result.rating, `AI evaluation indicates issues with the result for "${promptTest.name}"`).toBe(promptTest.expectedRating); + } + + return; // End test early if only one model + } + + // With multiple models, run consensus evaluation + // Collect evaluations from each model + const evaluations: EvaluationResult[] = []; + for (const modelConfig of modelConfigs) { + try { + const result = await 
aiEval(modelConfig, screenshotPath, promptTest); + + // Add individual model results to the test report + const individualSummary = ` +=== ${modelConfig.provider.toUpperCase()} EVALUATION === +Rating: ${result.rating} +Confidence: ${result.confidence} +Satisfaction: ${result.satisfactionPercentage} +--- +${formatCriteriaEvaluations(result.criteriaEvaluations)} +`; + + test.info().annotations.push({ + type: `${modelConfig.provider.charAt(0).toUpperCase() + modelConfig.provider.slice(1)} Evaluation`, + description: individualSummary + }); + + evaluations.push(result); + } catch (error) { + console.error(`Error evaluating with ${modelConfig.provider}:`, error); + } + } + + // Skip if we didn't get at least 2 successful evaluations when we have 2+ models + if (evaluations.length < 2 && modelConfigs.length >= 2) { + test.skip(true, `Not enough successful evaluations for consensus (got ${evaluations.length}, need at least 2)`); + return; + } + + // Collect ratings and confidences + const ratings = evaluations.map(e => e.rating); + const confidences = evaluations.map(e => e.confidence); + + // Check for consensus with confidence weighting if enabled + let majorityRating = ''; + let maxWeight = 0; + + // Simple counting without weighting + const counts: Record = {}; + ratings.forEach(rating => { + counts[rating] = (counts[rating] || 0) + 1; + }); + + // Find the majority rating + let maxCount = 0; + Object.entries(counts).forEach(([rating, count]) => { + if (count > maxCount) { + majorityRating = rating; + maxCount = count; + } + }); + + // Calculate agreement percentage + var agreementPercentage = maxCount / ratings.length; + + // Create consensus result + const consensusResult = { + rating: majorityRating, + agreementPercentage: (agreementPercentage * 100).toFixed(1) + '%', + ratings: ratings.join(', '), + confidences: confidences.join(', '), + models: modelConfigs.map(mc => `${mc.provider}:${mc.modelName}`).join(', '), + expectedRating: promptTest.expectedRating || 'Not specified' + }; + + // Calculate aggregate satisfaction score if enabled + let aggregateScore = 0; + let passesThreshold = false; + let scorePercentage = '0.0%'; + + if (config.execution.scoring?.enabled) { + aggregateScore = calculateAggregateScore(evaluations); + passesThreshold = doesPassThreshold(aggregateScore); + scorePercentage = formatScorePercentage(aggregateScore); + + // Add score information to consensus result + Object.assign(consensusResult, { + aggregateScore, + scorePercentage, + passesThreshold + }); + } + + // Format consensus result for test output + const consensusSummary = ` +=== CONSENSUS EVALUATION RESULTS === +Test: ${promptTest.name} +Prompt: "${promptTest.prompt}" +Models used: ${consensusResult.models} +Individual ratings: ${consensusResult.ratings} +Individual confidences: ${consensusResult.confidences} +Majority rating: ${consensusResult.rating} +${config.execution.scoring?.enabled ? `Aggregate satisfaction score: ${scorePercentage} +Passes threshold (${config.execution.scoring.passThreshold}%): ${passesThreshold ? 
'YES' : 'NO'}` : ''} +Expected rating: ${consensusResult.expectedRating} +=========================== +`; + + // Add the consensus to the test report + test.info().annotations.push({ + type: 'Consensus Evaluation', + description: consensusSummary + }); + + // Use the first evaluation's detailed explanation + const detailedExplanation = evaluations[0].explanation; + test.info().annotations.push({ + type: 'Detailed Explanation', + description: detailedExplanation + }); + + // Assertions when we have multiple models + if (modelConfigs.length >= 2) { + // Only check aggregate score threshold + expect( + passesThreshold, + `Aggregate satisfaction score (${scorePercentage}) is below threshold (${config.execution.scoring.passThreshold}%)` + ).toBe(true); + } + }); + } +}); \ No newline at end of file diff --git a/quadratic-ai-eval/tests/prompt-tests.ts b/quadratic-ai-eval/tests/prompt-tests.ts new file mode 100644 index 0000000000..52a4f6690a --- /dev/null +++ b/quadratic-ai-eval/tests/prompt-tests.ts @@ -0,0 +1,89 @@ +// Define the schema for test prompts +export interface PromptTest { + prompt: string; + name: string; + validationCriteria: string[]; + expectedRating?: 'GREEN' | 'YELLOW' | 'RED'; +} + +// Define the test prompts and their validation criteria +export const testPrompts: PromptTest[] = [ + { + name: 'States GDP Map', + prompt: 'Return a table of states with gdp per capita, and plot it on a map, use state abbreviations for the map', + validationCriteria: [ + 'Does the spreadsheet contain a table of states with GDP per capita?', + 'Is there a map visualization of this data?', + 'Is the data properly formatted and presented?', + 'Are there any obvious errors or issues?', + 'Are the states on the map correctly colored?' + ], + expectedRating: 'GREEN' + }, + { + name: 'First 500 Prime Numbers', + prompt: 'return the first 500 prime numbers in a list', + validationCriteria: [ + 'Does the spreadsheet contain a table of the first 500 prime numbers? It\'s ok if it\'s cut off and all the numbers are not visible.', + ], + expectedRating: 'GREEN' + }, + { + name: 'Stock Price Trends', + prompt: 'Show me the stock price trends for AAPL, MSFT, GOOG, and AMZN over the past 5 years with a line chart.', + validationCriteria: [ + 'Does the spreadsheet contain stock price data for AAPL, MSFT, GOOG, and AMZN?', + 'Is there a line chart visualization showing the trends?', + 'Does the chart cover approximately a 5-year period?', + 'Are the lines properly labeled or is there a legend?', + 'Is the data properly formatted and presented?' + ], + expectedRating: 'GREEN' + }, + { + name: 'COVID-19 Cases by Country', + prompt: 'Create a bar chart showing COVID-19 cases by country for the top 10 most affected countries.', + validationCriteria: [ + 'Does the spreadsheet contain COVID-19 case data for countries?', + 'Is there a bar chart visualization showing the top 10 countries?', + 'Are the countries sorted by number of cases?', + 'Are the bars properly labeled?', + 'Is the data properly formatted and presented?' + ], + expectedRating: 'GREEN' + }, + { + name: 'Basic Chart Creation', + prompt: 'Insert a line chart with the x axis as dates and y axis as number of sales, first generate sample data for that chart', + validationCriteria: [ + 'Does the spreadsheet contain a dataset of a time series of date and sales values?', + 'Is there a line chart visualization showcasing the sales over time?', + 'Is the data properly formatted and presented?', + 'Are there any obvious errors or issues?' 
+ ], + expectedRating: 'GREEN' + }, + { + name: 'DataFrame Manipulation', + prompt: 'Insert the following dataset and then use python to reference the data and make it cleaner/more readable, then display the cleaned dataset\nProduct Name\tPrice\tIn Stock\tDate Added\tRating\nlaptop pro 15"\t1299.99\tYes\t2023-01-15\t4.7/5\nSMARTPHONE X\t899\tNO\t2023-01-22\t3.9 stars\nwireless headphones\t129.95\tyes\t2023-02-05\t4.2/5\n49.99\tYes\t2023-01-30\t4.0 stars\nGAMING MOUSE\t79.99\t\t2023-02-15\t4.5/5\nexternal SSD 1TB\t159.99\tYES\t2023-03-01\t\nmechanical keyboard\t149.95\tno\t2023-03-10\t4.8 stars\nmonitor 27"\t249.99\tYes\t2023-02-20\t4.3/5', + validationCriteria: [ + 'Does the spreadsheet show the original dataset?', + 'Is there a cleaned version of the dataset displayed?', + 'Is the cleaned data more legible and easier to work with?', + 'Are there any obvious errors or issues?' + ], + expectedRating: 'GREEN' + }, + { + name: 'DataFrame Star Rating Count', + prompt: 'Insert the following dataset and then count the number of items that have 4 stars or greater; display the answer as "Number: " and the answer\nProduct Name\tPrice\tIn Stock\tDate Added\tRating\nlaptop pro 15"\t1299.99\tYes\t2023-01-15\t4.7/5\nSMARTPHONE X\t899\tNO\t2023-01-22\t3.9 stars\nwireless headphones\t129.95\tyes\t2023-02-05\t4.2/5\n49.99\tYes\t2023-01-30\t4.0 stars\nGAMING MOUSE\t79.99\t\t2023-02-15\t4.5/5\nexternal SSD 1TB\t159.99\tYES\t2023-03-01\t\nmechanical keyboard\t149.95\tno\t2023-03-10\t4.8 stars\nmonitor 27"\t249.99\tYes\t2023-02-20\t4.3/5', + validationCriteria: [ + 'Does the spreadsheet show the dataset?', + 'Is the correct count of items with 4 stars or greater displayed as "Number: 6"?', + 'Is the data properly formatted and presented?', + 'Are there any obvious errors or issues?' + ], + expectedRating: 'GREEN' + }, + // Add more test prompts as needed +];
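
To make the moving parts above concrete, here is a minimal sketch of a new prompt entry plus a standalone re-scoring script built on the helpers this diff introduces (`aiEval`, `calculateAggregateScore`, `doesPassThreshold`, `formatScorePercentage`). It assumes the file lives in `quadratic-ai-eval/tests/` so the relative imports resolve, and that it is run with a TypeScript runner such as `ts-node`. The prompt name, text, criteria, and screenshot path are illustrative placeholders, not part of this diff. Note also that `ai-evaluators.ts` imports the `openai` package at module load, so that package would need to be installed even for Claude-only runs; it is not currently declared in `quadratic-ai-eval/package.json`.

```typescript
import 'dotenv/config'; // ai-evaluators.ts reads keys from process.env but does not load .env itself

import { aiEval, EvaluationResult } from './ai-evaluators';
import config from './config';
import { calculateAggregateScore, doesPassThreshold, formatScorePercentage } from './evaluation-scoring';
import type { PromptTest } from './prompt-tests';

// Hypothetical prompt entry; appending an object like this to `testPrompts` in
// tests/prompt-tests.ts is all that is needed for prompt-evaluation.spec.ts to pick it up.
const monthlyRevenueChart: PromptTest = {
  name: 'Monthly Revenue Bar Chart',
  prompt: 'Generate sample monthly revenue data for 2023 and plot it as a bar chart',
  validationCriteria: [
    'Does the spreadsheet contain monthly revenue data for 2023?',
    'Is there a bar chart visualizing revenue by month?',
    'Is the data properly formatted and presented?'
  ],
  expectedRating: 'GREEN'
};

// Re-score an existing screenshot outside of a Playwright run, then aggregate the
// per-model satisfaction scores the same way prompt-evaluation.spec.ts does.
async function rescore(screenshotPath: string): Promise<void> {
  const evaluations: EvaluationResult[] = [];

  // Passing a bare provider string makes aiEval fall back to the defaults in tests/config.ts.
  if (process.env.ANTHROPIC_API_KEY) {
    evaluations.push(await aiEval('claude', screenshotPath, monthlyRevenueChart));
  }
  if (process.env.OPENAI_API_KEY) {
    evaluations.push(await aiEval('openai', screenshotPath, monthlyRevenueChart));
  }

  const aggregate = calculateAggregateScore(evaluations);
  console.log(`Aggregate satisfaction: ${formatScorePercentage(aggregate)}`);
  console.log(
    `Threshold ${config.execution.scoring.passThreshold}%: ${doesPassThreshold(aggregate) ? 'PASS' : 'FAIL'}`
  );
}

// The screenshot path is a placeholder for a file produced by an earlier test run.
rescore('test-results/Monthly-Revenue-Bar-Chart-example.png').catch(console.error);
```

Because `aiEval` accepts either a full `ModelConfig` or a bare provider string, the sketch relies on the model defaults defined in `tests/config.ts`; pass an explicit `ModelConfig` instead to pin a different model name or token budget.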