browserbase
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 0 additions & 3 deletions
diff --git a/evals/cli.ts b/evals/cli.ts
Lines changed: 0 additions & 1 deletion
diff --git a/evals/initStagehand.ts b/evals/initStagehand.ts
Lines changed: 0 additions & 1 deletion
diff --git a/evals/tasks/agent/gaia.ts b/evals/tasks/agent/gaia.ts
Lines changed: 44 additions & 50 deletions
diff --git a/evals/tasks/agent/onlineMind2Web.ts b/evals/tasks/agent/onlineMind2Web.ts
Lines changed: 11 additions & 4 deletions
diff --git a/evals/tasks/agent/osworld.ts b/evals/tasks/agent/osworld.ts
Lines changed: 17 additions & 36 deletions
diff --git a/evals/tasks/agent/webbench.ts b/evals/tasks/agent/webbench.ts
Lines changed: 15 additions & 40 deletions
@@ -233,15 +233,13 @@
   We're thrilled to announce the release of Stagehand 2.0, bringing significant improvements to make browser automation more powerful, faster, and easier to use than ever before.
 
   ### 🚀 New Features
-
   - **Introducing `stagehand.agent`**: A powerful new way to integrate SOTA Computer use models or Browserbase's [Open Operator](https://operator.browserbase.com) into Stagehand with one line of code! Perfect for multi-step workflows and complex interactions. [Learn more](https://docs.stagehand.dev/concepts/agent)
   - **Lightning-fast `act` and `extract`**: Major performance improvements to make your automations run significantly faster.
   - **Enhanced Logging**: Better visibility into what's happening during automation with improved logging and debugging capabilities.
   - **Comprehensive Documentation**: A completely revamped documentation site with better examples, guides, and best practices.
   - **Improved Error Handling**: More descriptive errors and better error recovery to help you debug issues faster.
 
   ### 🛠️ Developer Experience
-
   - **Better TypeScript Support**: Enhanced type definitions and better IDE integration
   - **Better Error Messages**: Clearer, more actionable error messages to help you debug faster
   - **Improved Caching**: More reliable action caching for better performance
@@ -502,7 +500,6 @@
 - [#316](https://github.com/browserbase/stagehand/pull/316) [`902e633`](https://github.com/browserbase/stagehand/commit/902e633e126a58b80b757ea0ecada01a7675a473) Thanks [@kamath](https://github.com/kamath)! - rename browserbaseResumeSessionID -> browserbaseSessionID
 
 - [#296](https://github.com/browserbase/stagehand/pull/296) [`f11da27`](https://github.com/browserbase/stagehand/commit/f11da27a20409c240ceeea2003d520f676def61a) Thanks [@kamath](https://github.com/kamath)! - - Deprecate fields in `init` in favor of constructor options
-
   - Deprecate `initFromPage` in favor of `browserbaseResumeSessionID` in constructor
   - Rename `browserBaseSessionCreateParams` -> `browserbaseSessionCreateParams`
 
 
@@ -381,7 +381,6 @@ function handleRun(args: string[]): void {
         webbench: "agent/webbench",
         gaia: "agent/gaia",
         webvoyager: "agent/webvoyager",
-        osworld: "agent/osworld",
         onlineMind2Web: "agent/onlineMind2Web",
       };
 
 
@@ -117,7 +117,6 @@ export const initStagehand = async ({
   } else {
     agentConfig = {
       model: modelName,
-      executionModel: "google/gemini-2.5-flash",
     } as AgentConfig;
   }
 
 
@@ -1,7 +1,11 @@
 import { EvalFunction } from "@/types/evals";
 import { Evaluator } from "../../evaluator";
 import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
+import { loadApiKeyFromEnv } from "@/lib/utils";
+import { modelToAgentProviderMap } from "@/lib/agent/AgentProvider";
+import dotenv from "dotenv";
 
+dotenv.config();
 /**
  * Data-driven GAIA agent eval
  * - Expects per-test params injected via eval runner: { id, level, web, ques }
@@ -15,25 +19,20 @@ export const gaia: EvalFunction = async ({
   debugUrl,
   sessionUrl,
   input,
-  agent,
+  modelName,
 }) => {
+  const startTime = Date.now();
+
   try {
     const params = ((input && input.params) || {}) as {
       id?: string;
       level?: number;
       web?: string;
       ques?: string;
+      expected?: string;
     };
 
     if (!params.web || !params.ques) {
-      logger.error({
-        category: "gaia",
-        level: 0,
-        message: `Missing GAIA params (web, ques).`,
-        auxiliary: {
-          params: { value: JSON.stringify(params), type: "object" },
-        },
-      });
       return {
         _success: false,
         error: `Missing GAIA params (web, ques). Got: ${JSON.stringify(params)}`,
@@ -42,7 +41,24 @@ export const gaia: EvalFunction = async ({
         logs: logger.getLogs(),
       };
     }
-    await stagehand.page.goto(params.web);
+
+    await stagehand.page.goto(params.web, {
+      timeout: 75_000,
+    });
+
+    const provider =
+      modelName in modelToAgentProviderMap
+        ? modelToAgentProviderMap[modelName]
+        : undefined;
+
+    const agent = stagehand.agent({
+      model: modelName,
+      provider,
+      instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}. ALWAYS OPERATE WITHIN THE PAGE OPENED BY THE USER, WHICHEVER TASK YOU ARE ATTEMPTING TO COMPLETE CAN BE ACCOMPLISHED WITHIN THE PAGE.`,
+      options: {
+        apiKey: loadApiKeyFromEnv(provider, stagehand.logger),
+      },
+    });
 
     // Start collecting screenshots with hybrid approach (10s intervals + agent triggers)
     const screenshotCollector = new ScreenshotCollector(stagehand.page, {
@@ -54,67 +70,45 @@ export const gaia: EvalFunction = async ({
       agent.setScreenshotCollector(screenshotCollector);
     }
 
-    let screenshots: Buffer[] = [];
-    let result;
+    screenshotCollector.start();
 
-    try {
-      screenshotCollector.start();
-
-      result = await agent.execute({
-        instruction: params.ques,
-        maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
-      });
-    } finally {
-      // Always stop collecting and get all screenshots, even on error
-      screenshots = screenshotCollector.stop();
-    }
+    const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 50;
+    const agentResult = await agent.execute({
+      instruction: params.ques,
+      maxSteps,
+    });
+    // Stop collecting and get all screenshots
+    const screenshots = screenshotCollector.stop();
 
     logger.log({
       category: "evaluation",
       message: `Collected ${screenshots.length} screenshots for evaluation`,
       level: 1,
     });
 
-    const expected = (params as Record<string, unknown>).expected as
-      | string
-      | undefined;
+    const expected = params.expected;
     const evaluator = new Evaluator(stagehand);
     const evalResult = await evaluator.ask({
       question: `Did the agent provide the expected answer: "${expected}"?`,
-      answer: result?.message || "",
-      screenshot: false,
+      answer: agentResult.message || "",
+      screenshot: screenshots,
     });
 
     return {
       _success: evalResult.evaluation === "YES",
       reasoning: evalResult.reasoning,
       expectedAnswer: expected,
+      final_answer: agentResult?.message,
+      screenshotCount: screenshots.length,
+      task_level: params.level,
+      execution_time: Date.now() - startTime,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
     };
   } catch (error) {
-    logger.error({
-      category: "gaia",
-      level: 0,
-      message: `Unhandled error in GAIA task`,
-      auxiliary: {
-        error: {
-          value: error instanceof Error ? error.message : String(error),
-          type: "string",
-        },
-        trace: {
-          value: error instanceof Error && error.stack ? error.stack : "",
-          type: "string",
-        },
-      },
-    });
-    return {
-      _success: false,
-      error,
-      debugUrl,
-      sessionUrl,
-      logs: logger.getLogs(),
-    };
+    // Let the error propagate - the parent runner will handle cleanup
+    console.error(error);
+    throw error;
   }
 };
@@ -6,14 +6,21 @@ import { loadApiKeyFromEnv } from "@/lib/utils";
 import dotenv from "dotenv";
 
 dotenv.config();
-
+/**
+ * Data-driven OnlineMind2Web agent eval
+ * - Expects per-test params injected via eval runner: { task_id, confirmed_task, website, reference_length, level }
+ * - Starts at `website`, runs the agent with `confirmed_task` as instruction
+ * - Requires the agent to output a final answer in the form: "Final Answer: <value>"
+ * - Marks success if such an answer string is present (exact matching against dataset can be layered later)
+ * - Uses the evaluator to determine if the agent successfully completed the task
+ */
 export const onlineMind2Web: EvalFunction = async ({
   stagehand,
   logger,
   debugUrl,
   sessionUrl,
-  modelName,
   input,
+  modelName,
 }) => {
   const startTime = Date.now();
 
@@ -66,10 +73,10 @@ export const onlineMind2Web: EvalFunction = async ({
 
     screenshotCollector.start();
 
-    const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 5;
+    const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 50;
     const agentResult = await agent.execute({
       instruction: params.confirmed_task,
-      maxSteps: maxSteps,
+      maxSteps,
     });
 
     logger.log(agentResult);
 
@@ -101,51 +101,32 @@ export const osworld: EvalFunction = async ({
     let screenshots: Buffer[] = [];
     let result;
 
-    try {
-      screenshotCollector.start();
+    screenshotCollector.start();
 
-      // Execute the task using the pre-initialized agent with timeout
-      const executionPromise = agent.execute({
-        instruction: params.instruction,
-        maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
-      });
+    // Execute the task using the pre-initialized agent with timeout
+    const executionPromise = agent.execute({
+      instruction: params.instruction,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
+    });
 
-      // Apply timeout wrapper
-      const timeoutPromise = new Promise((_, reject) =>
-        setTimeout(
-          () => reject(new Error(`Task timed out after ${timeout}ms`)),
-          timeout,
-        ),
-      );
-
-      result = await Promise.race([executionPromise, timeoutPromise]);
-    } finally {
-      // Always stop collecting and get all screenshots, even on error
-      screenshots = screenshotCollector.stop();
-    }
+    // Apply timeout wrapper
+    const timeoutPromise = new Promise((_, reject) =>
+      setTimeout(
+        () => reject(new Error(`Task timed out after ${timeout}ms`)),
+        timeout,
+      ),
+    );
+
+    result = await Promise.race([executionPromise, timeoutPromise]);
+    // Stop collecting and get all screenshots (not reached if execution throws or times out)
+    screenshots = screenshotCollector.stop();
 
     logger.log({
       category: "evaluation",
       message: `Collected ${screenshots.length} screenshots for evaluation`,
       level: 1,
     });
 
-    logger.log({
-      category: "osworld",
-      message: `Task ${params.id} execution completed`,
-      level: 1,
-      auxiliary: {
-        task_id: {
-          value: params.id,
-          type: "string",
-        },
-        has_result: {
-          value: (!!result).toString(),
-          type: "string",
-        },
-      },
-    });
-
     // Evaluate based on OSWorld evaluation type
     const success = await evaluateOSWorldTask(stagehand, params, logger);
 
 
@@ -10,6 +10,8 @@ export const webbench: EvalFunction = async ({
   input,
   agent,
 }) => {
+  const startTime = Date.now();
+
   try {
     const params = ((input && input.params) || {}) as {
       id?: string;
@@ -49,45 +51,15 @@ export const webbench: EvalFunction = async ({
       agent.setScreenshotCollector(screenshotCollector);
     }
 
-    let screenshots: Buffer[] = [];
-    let result;
-
-    try {
-      screenshotCollector.start();
-
-      logger.log({
-        category: "webbench",
-        message: `Starting WebBench task ${params.id}`,
-        level: 1,
-        auxiliary: {
-          category: {
-            value: params.category || "unknown",
-            type: "string",
-          },
-          difficulty: {
-            value: params.difficulty || "unknown",
-            type: "string",
-          },
-          url: {
-            value: params.url,
-            type: "string",
-          },
-          task_preview: {
-            value: params.task.substring(0, 100) + "...",
-            type: "string",
-          },
-        },
-      });
+    screenshotCollector.start();
 
-      // Execute the task using the pre-initialized agent
-      result = await agent.execute({
-        instruction: params.task,
-        maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
-      });
-    } finally {
-      // Always stop collecting and get all screenshots, even on error
-      screenshots = screenshotCollector.stop();
-    }
+    // Execute the task using the pre-initialized agent
+    const agentResult = await agent.execute({
+      instruction: params.task,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
+    });
+    // Stop collecting and get all screenshots (not reached if agent.execute throws)
+    const screenshots = screenshotCollector.stop();
 
     logger.log({
       category: "evaluation",
@@ -106,7 +78,7 @@ export const webbench: EvalFunction = async ({
           type: "string",
         },
         has_result: {
-          value: (!!result).toString(),
+          value: (!!agentResult).toString(),
           type: "string",
         },
       },
@@ -136,7 +108,7 @@ export const webbench: EvalFunction = async ({
       question: evalPrompt,
       screenshot: screenshots,
       agentReasoning:
-        result?.message ||
+        agentResult.message ||
         "no reasoning available, agent potentially hit step limit",
     });
 
@@ -146,6 +118,9 @@ export const webbench: EvalFunction = async ({
       task_id: params.id,
       category: params.category,
       difficulty: params.difficulty || "unknown",
+      screenshotCount: screenshots.length,
+      final_answer: agentResult?.message,
+      execution_time: Date.now() - startTime,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
Original file line number	Diff line number	Diff line change
`@@ -117,7 +117,6 @@ export const initStagehand = async ({`
`117`	`117`	`} else {`
`118`	`118`	`agentConfig = {`
`119`	`119`	`model: modelName,`
`120`		`- executionModel: "google/gemini-2.5-flash",`
`121`	`120`	`} as AgentConfig;`
`122`	`121`	`}`
`123`	`122`