quadratichq · davidkircos · Feb 25, 2025 · Feb 26, 2025 · Feb 26, 2025
diff --git a/README.md b/README.md
@@ -26,3 +26,70 @@ Want to contribute? Read our [Contributing Guide](./CONTRIBUTING.md)
 ## Quadratic is hiring
 
 Check out our open roles ⟶ [careers.quadratichq.com](https://careers.quadratichq.com)
+
+# Quadratic QA Test Suite
+
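+This repository contains automated tests for the Quadratic application using Playwright. The npm scripts referenced below (`test`, `test:ui`, `test:headed`, `test:debug`, `report`) are defined in `quadratic-ai-eval/package.json`, so run the commands from the `quadratic-ai-eval` directory.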
+
+## Prerequisites
+
+- Node.js (v18 or newer)
+- npm
+
+## Setup
+
+1. Install dependencies:
+   ```
+   npm install
+   ```
+
+2. Install Playwright browsers:
+   ```
+   npx playwright install
+   ```
+
+## Running Tests
+
+Run all tests:
+```
+npm test
+```
+
+Run tests with UI mode (for debugging and development):
+```
+npm run test:ui
+```
+
+Run tests in headed mode (with visible browser):
+```
+npm run test:headed
+```
+
+Run tests in debug mode:
+```
+npm run test:debug
+```
+
+View the HTML report after a test run:
+```
+npm run report
+```
+
+## Test Structure
+
+- `tests/homepage.spec.ts` - Basic tests for the homepage
+- `tests/login.spec.ts` - Tests for login functionality
+- `tests/spreadsheet.spec.ts` - Tests for spreadsheet functionality
+
+## Notes
+
+- These tests are designed for the Quadratic application at `app.quadratichq.com`
+- The selectors in the tests may need to be updated based on the actual UI
+- Screenshots are saved in the project root directory
+
+## Adding New Tests
+
+1. Create a new file in the `tests` directory with a `.spec.ts` extension
+2. Import the necessary Playwright modules
+3. Write your tests using the Playwright API
+4. Run the tests to verify they work as expected
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -84,6 +84,7 @@
     "chalk": "^5.3.0",
     "commander": "^11.1.0",
     "concurrently": "^6.5.1",
+    "dotenv": "^16.4.7",
     "eslint": "^8.57.0",
     "jest": "^29.6.1",
     "kill-port": "^2.0.1",

diff --git a/quadratic-ai-eval/.gitignore b/quadratic-ai-eval/.gitignore
@@ -0,0 +1,50 @@
+# Playwright specific
+/test-results/
+/playwright-report/
+/blob-report/
+/playwright/.cache/
+/html-report/
+/allure-results/
+/allure-report/
+
+# Node.js dependencies
+/node_modules/
+/package-lock.json
+/yarn.lock
+/pnpm-lock.yaml
+
+# Environment variables
+.env
+.env.local
+.env.development
+.env.test
+.env.production
+
+# IDE specific files
+.idea/
+.vscode/
+*.code-workspace
+.DS_Store
+
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+
+# Coverage directory
+/coverage/
+
+# Temporary files
+/tmp/
+/temp/
+
+# Screenshots and videos from test runs
+/screenshots/
+/videos/
+
+# User-specific files
+*.user
+*.suo
+*.userprefs 
diff --git a/quadratic-ai-eval/README.md b/quadratic-ai-eval/README.md
@@ -0,0 +1,121 @@
+# Quadratic AI Evaluation Framework
+
+This framework allows you to test Quadratic's AI capabilities by running multiple prompts in parallel and evaluating the results using Claude.
+
+## Setup
+
+1. Install dependencies:
+```bash
+npm install
+```
+
+2. Set up environment variables:
+```bash
+# Create a .env file with your Anthropic API key
+echo "ANTHROPIC_API_KEY=your_api_key_here" > .env
+```
+
+## Running Tests
+
+To run all tests:
+```bash
+npx playwright test
+```
+
+To run the prompt evaluation tests specifically:
+```bash
+npx playwright test prompt-evaluation.spec.ts
+# or use the npm script
+npm run test:prompt
+```
+
+To run tests with a UI:
+```bash
+npx playwright test --ui
+```
+
+## Test Categories
+
+The framework includes different categories of tests:
+
+- **Basic prompts**: Simple data visualization prompts (run with `npm run test:basic`)
+- **Complex data prompts**: More complex data analysis prompts (run with `npm run test:complex`)
+- **Parallel execution**: Runs all tests across 4 parallel workers (run with `npm run test:parallel`)
+
+## Adding New Tests
+
+To add new test prompts, edit the `tests/prompt-tests.ts` file and add your prompts to the appropriate array:
+
+```typescript
+// For basic data visualization prompts
+export const testPrompts: PromptTest[] = [
+  {
+    name: 'Your Test Name',
+    prompt: 'Your prompt text here',
+    validationCriteria: [
+      'Criterion 1?',
+      'Criterion 2?',
+      'Criterion 3?',
+      'Criterion 4?',
+      'Criterion 5?'
+    ],
+    expectedRating: 'GREEN' // or 'YELLOW' or 'RED'
+  },
+  // Add more test prompts here
+];
+
+// For more complex data analysis prompts
+export const complexDataPrompts: PromptTest[] = [
+  // Add your complex data prompts here
+];
+```
+
+### Test Structure
+
+Each test prompt consists of:
+
+- `name`: A descriptive name for the test
+- `prompt`: The actual prompt to send to Quadratic
+- `validationCriteria`: An array of questions that Claude will use to evaluate the result
+- `expectedRating`: The expected rating from Claude (GREEN, YELLOW, or RED)
+
+## How It Works
+
+1. The test framework logs in to Quadratic using Auth0 credentials
+2. For each prompt in the test arrays:
+   - Navigates to the file creation page with the prompt
+   - Waits for the spreadsheet to be generated
+   - Takes a screenshot of the result
+   - Sends the screenshot to Claude for evaluation
+   - Validates that the result meets the expected criteria
+
+## Evaluation Criteria
+
+Claude evaluates each result and provides a rating:
+
+- **GREEN**: The result looks correct and fully satisfies the prompt requirements
+- **YELLOW**: The result partially satisfies the prompt but has minor issues
+- **RED**: The result is incorrect or has major issues
+
+## Test Reports
+
+Test reports are generated in the `playwright-report` directory and can be opened with `npm run report`. Each test includes:
+
+- The prompt text
+- A screenshot of the result
+- Claude's evaluation
+- The test status (passed/failed)
+
+## Customizing Tests
+
+You can customize the test framework by:
+
+1. Adding new prompt collections in `tests/prompt-tests.ts`
+2. Modifying the evaluation criteria
+3. Adjusting timeouts and other parameters in `config.ts`
+
+## Troubleshooting
+
+- If tests fail with authentication errors, check your Auth0 credentials
+- If Claude evaluation fails, check your Anthropic API key
+- If tests time out, you may need to increase the timeout values in the config file 
diff --git a/quadratic-ai-eval/package.json b/quadratic-ai-eval/package.json
@@ -0,0 +1,35 @@
+{
+  "name": "quadratic-ai-eval",
+  "version": "1.0.0",
+  "description": "Playwright test suite for Quadratic with Auth0 authentication",
+  "main": "index.js",
+  "scripts": {
+    "test": "playwright test",
+    "test:ui": "playwright test --ui",
+    "test:headed": "playwright test --headed",
+    "test:debug": "playwright test --debug",
+    "test:dev": "playwright test --headed --debug --timeout=0",
+    "test:basic": "playwright test -g \"Testing basic prompt\"",
+    "test:complex": "playwright test -g \"Testing complex data prompt\"",
+    "test:parallel": "playwright test --workers=4",
+    "test:prompt": "playwright test prompt-evaluation.spec.ts",
+    "report": "playwright show-report"
+  },
+  "keywords": [
+    "quadratic",
+    "testing",
+    "playwright",
+    "automation",
+    "auth0"
+  ],
+  "author": "",
+  "license": "ISC",
+  "devDependencies": {
+    "@playwright/test": "^1.40.0",
+    "dotenv": "^16.4.7"
+  },
+  "dependencies": {
+    "@anthropic-ai/sdk": "^0.37.0",
+    "zod": "^3.24.2"
+  }
+}