browserbase · miguelg719 · Sep 23, 2025 · Sep 13, 2025 · Sep 13, 2025 · Sep 13, 2025
diff --git a/.changeset/curly-boats-push.md b/.changeset/curly-boats-push.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand-evals": patch
+---
+
+Improve the evals screenshot service: add image-hash diffing when adding screenshots, and switch to intercepting screenshots from the agent.
diff --git a/evals/cli.ts b/evals/cli.ts
@@ -381,7 +381,6 @@ function handleRun(args: string[]): void {
         webbench: "agent/webbench",
         gaia: "agent/gaia",
         webvoyager: "agent/webvoyager",
-        osworld: "agent/osworld",
         onlineMind2Web: "agent/onlineMind2Web",
       };
 

diff --git a/evals/evaluator.ts b/evals/evaluator.ts
@@ -19,7 +19,8 @@ import {
 import { LLMParsedResponse } from "@/lib/inference";
 import { LLMResponse } from "@/lib/llm/LLMClient";
 import { LogLine } from "@/types/log";
-import { z } from "zod";
+import { z } from "zod/v3";
+import { imageResize } from "./utils/imageUtils";
 
 dotenv.config();
 
@@ -292,17 +293,36 @@ export class Evaluator {
       this.modelClientOptions,
     );
 
-    const imageContents = screenshots.map((screenshot) => ({
+    // Downsize the screenshots before base64-encoding them as image content:
+    const downsizedScreenshots = await Promise.all(
+      screenshots.map(async (screenshot) => {
+        return await imageResize(screenshot, 0.7);
+      }),
+    );
+
+    const imageContents = downsizedScreenshots.map((screenshot) => ({
       type: "image_url" as const,
       image_url: {
-        url: `data:image/jpeg;base64,${screenshot.toString("base64")}`,
+        url: `data:image/png;base64,${screenshot.toString("base64")}`,
       },
     }));
 
+    this.stagehand.logger?.({
+      category: "evaluator",
+      message: `Evaluating question: ${question} with ${screenshots.length} screenshots`,
+      level: 2,
+      auxiliary: {
+        images: {
+          value: JSON.stringify(imageContents),
+          type: "object",
+        },
+      },
+    });
+
     const response = await llmClient.createChatCompletion<
       LLMParsedResponse<LLMResponse>
     >({
-      logger: this.silentLogger,
+      logger: this.stagehand.logger,
       options: {
         messages: [
           { role: "system", content: systemPrompt },

diff --git a/evals/index.eval.ts b/evals/index.eval.ts
@@ -300,6 +300,8 @@ const generateFilteredTestcases = (): Testcase[] => {
   const braintrustProjectName =
     process.env.CI === "true" ? "stagehand" : "stagehand-dev";
 
+  const startTime = Date.now();
+
   try {
     // Run the evaluations with the braintrust Eval function
     const evalResult = await Eval(braintrustProjectName, {
@@ -483,6 +485,9 @@ const generateFilteredTestcases = (): Testcase[] => {
 
     // Generate and write the summary
     await generateSummary(summaryResults, experimentName);
+    console.log(
+      `\n⌛️Evaluation completed in ${(Date.now() - startTime) / 1000}s\n`,
+    );
   } catch (error) {
     console.error("Error during evaluation run:", error);
     process.exit(1);

diff --git a/evals/initStagehand.ts b/evals/initStagehand.ts
@@ -117,7 +117,6 @@ export const initStagehand = async ({
   } else {
     agentConfig = {
       model: modelName,
-      executionModel: "google/gemini-2.5-flash",
     } as AgentConfig;
   }
 

diff --git a/evals/package.json b/evals/package.json
@@ -12,7 +12,8 @@
     "e2e:local": "pnpm run build && playwright test --config deterministic/local.playwright.config.ts"
   },
   "dependencies": {
-    "@browserbasehq/stagehand": "workspace:*"
+    "@browserbasehq/stagehand": "workspace:*",
+    "sharp": "^0.33.5"
   },
   "devDependencies": {
     "@types/papaparse": "^5.3.16",

diff --git a/evals/tasks/agent/gaia.ts b/evals/tasks/agent/gaia.ts
@@ -1,6 +1,11 @@
 import { EvalFunction } from "@/types/evals";
 import { Evaluator } from "../../evaluator";
+import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
+import { loadApiKeyFromEnv } from "@/lib/utils";
+import { modelToAgentProviderMap } from "@/lib/agent/AgentProvider";
+import dotenv from "dotenv";
 
+dotenv.config();
 /**
  * Data-driven GAIA agent eval
  * - Expects per-test params injected via eval runner: { id, level, web, ques }
@@ -14,25 +19,20 @@ export const gaia: EvalFunction = async ({
   debugUrl,
   sessionUrl,
   input,
-  agent,
+  modelName,
 }) => {
+  const startTime = Date.now();
+
   try {
     const params = ((input && input.params) || {}) as {
       id?: string;
       level?: number;
       web?: string;
       ques?: string;
+      expected?: string;
     };
 
     if (!params.web || !params.ques) {
-      logger.error({
-        category: "gaia",
-        level: 0,
-        message: `Missing GAIA params (web, ques).`,
-        auxiliary: {
-          params: { value: JSON.stringify(params), type: "object" },
-        },
-      });
       return {
         _success: false,
         error: `Missing GAIA params (web, ques). Got: ${JSON.stringify(params)}`,
@@ -41,53 +41,74 @@ export const gaia: EvalFunction = async ({
         logs: logger.getLogs(),
       };
     }
-    await stagehand.page.goto(params.web);
 
-    const result = await agent.execute({
+    await stagehand.page.goto(params.web, {
+      timeout: 75_000,
+    });
+
+    const provider =
+      modelName in modelToAgentProviderMap
+        ? modelToAgentProviderMap[modelName]
+        : undefined;
+
+    const agent = stagehand.agent({
+      model: modelName,
+      provider,
+      instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}. ALWAYS OPERATE WITHIN THE PAGE OPENED BY THE USER, WHICHEVER TASK YOU ARE ATTEMPTING TO COMPLETE CAN BE ACCOMPLISHED WITHIN THE PAGE.`,
+      options: {
+        apiKey: loadApiKeyFromEnv(provider, stagehand.logger),
+      },
+    });
+
+    // Start collecting screenshots with hybrid approach
+    const screenshotCollector = new ScreenshotCollector(stagehand.page, {
+      maxScreenshots: 8, // Keep last 8 screenshots
+    });
+
+    // Set the collector on the agent so it captures screenshots
+    if (agent.setScreenshotCollector) {
+      agent.setScreenshotCollector(screenshotCollector);
+    }
+
+    screenshotCollector.start();
+
+    const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 50;
+    const agentResult = await agent.execute({
       instruction: params.ques,
-      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
+      maxSteps,
+    });
+    // Stop collecting and get all screenshots
+    const screenshots = screenshotCollector.stop();
+
+    logger.log({
+      category: "evaluation",
+      message: `Collected ${screenshots.length} screenshots for evaluation`,
+      level: 1,
     });
 
-    const expected = (params as Record<string, unknown>).expected as
-      | string
-      | undefined;
+    const expected = params.expected;
     const evaluator = new Evaluator(stagehand);
     const evalResult = await evaluator.ask({
       question: `Did the agent provide the expected answer: "${expected}"?`,
-      answer: result?.message || "",
-      screenshot: false,
+      answer: agentResult.message || "",
+      screenshot: screenshots,
     });
 
     return {
       _success: evalResult.evaluation === "YES",
       reasoning: evalResult.reasoning,
       expectedAnswer: expected,
+      final_answer: agentResult?.message,
+      screenshotCount: screenshots.length,
+      task_level: params.level,
+      execution_time: Date.now() - startTime,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
     };
   } catch (error) {
-    logger.error({
-      category: "gaia",
-      level: 0,
-      message: `Unhandled error in GAIA task`,
-      auxiliary: {
-        error: {
-          value: error instanceof Error ? error.message : String(error),
-          type: "string",
-        },
-        trace: {
-          value: error instanceof Error && error.stack ? error.stack : "",
-          type: "string",
-        },
-      },
-    });
-    return {
-      _success: false,
-      error,
-      debugUrl,
-      sessionUrl,
-      logs: logger.getLogs(),
-    };
+    // Let the error propagate - the parent runner will handle cleanup
+    console.error(error);
+    throw error;
   }
 };
diff --git a/evals/tasks/agent/google_maps_2.ts b/evals/tasks/agent/google_maps_2.ts
@@ -1,6 +1,6 @@
 import { EvalFunction } from "@/types/evals";
 import { Evaluator } from "../../evaluator";
-import { z } from "zod";
+import { z } from "zod/v3";
 
 export const google_maps_2: EvalFunction = async ({
   debugUrl,

diff --git a/evals/tasks/agent/onlineMind2Web.ts b/evals/tasks/agent/onlineMind2Web.ts
@@ -1,18 +1,29 @@
 import { EvalFunction } from "@/types/evals";
 import { Evaluator } from "../../evaluator";
 import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
+import { modelToAgentProviderMap } from "@/lib/agent/AgentProvider";
+import { loadApiKeyFromEnv } from "@/lib/utils";
 import dotenv from "dotenv";
-import fs from "fs";
-dotenv.config();
 
+dotenv.config();
+/**
+ * Data-driven OnlineMind2Web agent eval
+ * - Expects per-test params injected via eval runner: { task_id, confirmed_task, website, reference_length, level }
+ * - Starts at `website`, runs the agent with `confirmed_task` as instruction
+ * - Requires the agent to output a final answer in the form: "Final Answer: <value>"
+ * - Collects screenshots during the run and passes them, with the agent's final message, to the evaluator
+ * - Marks success only when the evaluator's verdict is "YES"
+ */
 export const onlineMind2Web: EvalFunction = async ({
   stagehand,
   logger,
   debugUrl,
   sessionUrl,
   input,
-  agent,
+  modelName,
 }) => {
+  const startTime = Date.now();
+
   try {
     const params = ((input && input.params) || {}) as {
       task_id?: string;
@@ -33,25 +44,42 @@ export const onlineMind2Web: EvalFunction = async ({
     }
 
     await stagehand.page.goto(params.website, {
-      timeout: 60_000,
+      timeout: 75_000,
     });
 
-    const screenshot = await stagehand.page.screenshot();
-    fs.writeFileSync("screenshot.png", screenshot);
+    const provider =
+      modelName in modelToAgentProviderMap
+        ? modelToAgentProviderMap[modelName]
+        : undefined;
+
+    const agent = stagehand.agent({
+      model: modelName,
+      provider,
+      instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}. ALWAYS OPERATE WITHIN THE PAGE OPENED BY THE USER, WHICHEVER TASK YOU ARE ATTEMPTING TO COMPLETE CAN BE ACCOMPLISHED WITHIN THE PAGE.`,
+      options: {
+        apiKey: loadApiKeyFromEnv(provider, stagehand.logger),
+      },
+    });
 
     // Start collecting screenshots in parallel
     const screenshotCollector = new ScreenshotCollector(stagehand.page, {
-      maxScreenshots: 5, // Keep up to the last 5 screenshots
-      captureOnNavigation: true, // Also capture on page navigation
+      maxScreenshots: 8, // Keep up to the last 8 screenshots
     });
 
+    // Set the collector on the agent so it captures screenshots
+    if (agent.setScreenshotCollector) {
+      agent.setScreenshotCollector(screenshotCollector);
+    }
+
     screenshotCollector.start();
 
+    const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 50;
     const agentResult = await agent.execute({
       instruction: params.confirmed_task,
-      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
+      maxSteps,
     });
 
+    logger.log(agentResult);
     // Stop collecting and get all screenshots
     const screenshots = screenshotCollector.stop();
 
@@ -63,7 +91,7 @@ export const onlineMind2Web: EvalFunction = async ({
 
     const evaluator = new Evaluator(stagehand);
     const evalResult = await evaluator.ask({
-      question: `Did the agent successfully complete this task: "${params.confirmed_task}"?`,
+      question: `Did the agent successfully complete this task: "${params.confirmed_task}"? The task might be a bit outdated or impossible to complete, in those cases lean towards YES.`,
       screenshot: screenshots,
       agentReasoning:
         agentResult.message ||
@@ -73,19 +101,17 @@ export const onlineMind2Web: EvalFunction = async ({
     return {
       _success: evalResult.evaluation === "YES",
       reasoning: evalResult.reasoning,
-      // screenshotCount: screenshots.length,
+      final_answer: agentResult?.message,
+      screenshotCount: screenshots.length,
       task_level: params.level,
+      execution_time: Date.now() - startTime,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
     };
   } catch (error) {
-    return {
-      _success: false,
-      error,
-      debugUrl,
-      sessionUrl,
-      logs: logger.getLogs(),
-    };
+    // Let the error propagate - the parent runner will handle cleanup
+    console.error(error);
+    throw error;
   }
 };