Skip to content

Commit 691334d

Browse files
committed
clean benchmarks
1 parent 9c5f69a commit 691334d

20 files changed

+3705
-148
lines changed

CHANGELOG.md

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -233,15 +233,13 @@
233233
We're thrilled to announce the release of Stagehand 2.0, bringing significant improvements to make browser automation more powerful, faster, and easier to use than ever before.
234234

235235
### 🚀 New Features
236-
237236
- **Introducing `stagehand.agent`**: A powerful new way to integrate SOTA Computer use models or Browserbase's [Open Operator](https://operator.browserbase.com) into Stagehand with one line of code! Perfect for multi-step workflows and complex interactions. [Learn more](https://docs.stagehand.dev/concepts/agent)
238237
- **Lightning-fast `act` and `extract`**: Major performance improvements to make your automations run significantly faster.
239238
- **Enhanced Logging**: Better visibility into what's happening during automation with improved logging and debugging capabilities.
240239
- **Comprehensive Documentation**: A completely revamped documentation site with better examples, guides, and best practices.
241240
- **Improved Error Handling**: More descriptive errors and better error recovery to help you debug issues faster.
242241

243242
### 🛠️ Developer Experience
244-
245243
- **Better TypeScript Support**: Enhanced type definitions and better IDE integration
246244
- **Better Error Messages**: Clearer, more actionable error messages to help you debug faster
247245
- **Improved Caching**: More reliable action caching for better performance
@@ -502,7 +500,6 @@
502500
- [#316](https://github.com/browserbase/stagehand/pull/316) [`902e633`](https://github.com/browserbase/stagehand/commit/902e633e126a58b80b757ea0ecada01a7675a473) Thanks [@kamath](https://github.com/kamath)! - rename browserbaseResumeSessionID -> browserbaseSessionID
503501

504502
- [#296](https://github.com/browserbase/stagehand/pull/296) [`f11da27`](https://github.com/browserbase/stagehand/commit/f11da27a20409c240ceeea2003d520f676def61a) Thanks [@kamath](https://github.com/kamath)! - - Deprecate fields in `init` in favor of constructor options
505-
506503
- Deprecate `initFromPage` in favor of `browserbaseResumeSessionID` in constructor
507504
- Rename `browserBaseSessionCreateParams` -> `browserbaseSessionCreateParams`
508505

evals/cli.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,6 @@ function handleRun(args: string[]): void {
381381
webbench: "agent/webbench",
382382
gaia: "agent/gaia",
383383
webvoyager: "agent/webvoyager",
384-
osworld: "agent/osworld",
385384
onlineMind2Web: "agent/onlineMind2Web",
386385
};
387386

evals/initStagehand.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,6 @@ export const initStagehand = async ({
117117
} else {
118118
agentConfig = {
119119
model: modelName,
120-
executionModel: "google/gemini-2.5-flash",
121120
} as AgentConfig;
122121
}
123122

evals/tasks/agent/gaia.ts

Lines changed: 44 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
import { EvalFunction } from "@/types/evals";
22
import { Evaluator } from "../../evaluator";
33
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
4+
import { loadApiKeyFromEnv } from "@/lib/utils";
5+
import { modelToAgentProviderMap } from "@/lib/agent/AgentProvider";
6+
import dotenv from "dotenv";
47

8+
dotenv.config();
59
/**
610
* Data-driven GAIA agent eval
711
* - Expects per-test params injected via eval runner: { id, level, web, ques, expected }
@@ -15,25 +19,20 @@ export const gaia: EvalFunction = async ({
1519
debugUrl,
1620
sessionUrl,
1721
input,
18-
agent,
22+
modelName,
1923
}) => {
24+
const startTime = Date.now();
25+
2026
try {
2127
const params = ((input && input.params) || {}) as {
2228
id?: string;
2329
level?: number;
2430
web?: string;
2531
ques?: string;
32+
expected?: string;
2633
};
2734

2835
if (!params.web || !params.ques) {
29-
logger.error({
30-
category: "gaia",
31-
level: 0,
32-
message: `Missing GAIA params (web, ques).`,
33-
auxiliary: {
34-
params: { value: JSON.stringify(params), type: "object" },
35-
},
36-
});
3736
return {
3837
_success: false,
3938
error: `Missing GAIA params (web, ques). Got: ${JSON.stringify(params)}`,
@@ -42,7 +41,24 @@ export const gaia: EvalFunction = async ({
4241
logs: logger.getLogs(),
4342
};
4443
}
45-
await stagehand.page.goto(params.web);
44+
45+
await stagehand.page.goto(params.web, {
46+
timeout: 75_000,
47+
});
48+
49+
const provider =
50+
modelName in modelToAgentProviderMap
51+
? modelToAgentProviderMap[modelName]
52+
: undefined;
53+
54+
const agent = stagehand.agent({
55+
model: modelName,
56+
provider,
57+
instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}. ALWAYS OPERATE WITHIN THE PAGE OPENED BY THE USER, WHICHEVER TASK YOU ARE ATTEMPTING TO COMPLETE CAN BE ACCOMPLISHED WITHIN THE PAGE.`,
58+
options: {
59+
apiKey: loadApiKeyFromEnv(provider, stagehand.logger),
60+
},
61+
});
4662

4763
// Start collecting screenshots with hybrid approach (10s intervals + agent triggers)
4864
const screenshotCollector = new ScreenshotCollector(stagehand.page, {
@@ -54,67 +70,45 @@ export const gaia: EvalFunction = async ({
5470
agent.setScreenshotCollector(screenshotCollector);
5571
}
5672

57-
let screenshots: Buffer[] = [];
58-
let result;
73+
screenshotCollector.start();
5974

60-
try {
61-
screenshotCollector.start();
62-
63-
result = await agent.execute({
64-
instruction: params.ques,
65-
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
66-
});
67-
} finally {
68-
// Always stop collecting and get all screenshots, even on error
69-
screenshots = screenshotCollector.stop();
70-
}
75+
const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 50;
76+
const agentResult = await agent.execute({
77+
instruction: params.ques,
78+
maxSteps,
79+
});
80+
// Stop collecting and get all screenshots
81+
const screenshots = screenshotCollector.stop();
7182

7283
logger.log({
7384
category: "evaluation",
7485
message: `Collected ${screenshots.length} screenshots for evaluation`,
7586
level: 1,
7687
});
7788

78-
const expected = (params as Record<string, unknown>).expected as
79-
| string
80-
| undefined;
89+
const expected = params.expected;
8190
const evaluator = new Evaluator(stagehand);
8291
const evalResult = await evaluator.ask({
8392
question: `Did the agent provide the expected answer: "${expected}"?`,
84-
answer: result?.message || "",
85-
screenshot: false,
93+
answer: agentResult.message || "",
94+
screenshot: screenshots,
8695
});
8796

8897
return {
8998
_success: evalResult.evaluation === "YES",
9099
reasoning: evalResult.reasoning,
91100
expectedAnswer: expected,
101+
final_answer: agentResult?.message,
102+
screenshotCount: screenshots.length,
103+
task_level: params.level,
104+
execution_time: Date.now() - startTime,
92105
debugUrl,
93106
sessionUrl,
94107
logs: logger.getLogs(),
95108
};
96109
} catch (error) {
97-
logger.error({
98-
category: "gaia",
99-
level: 0,
100-
message: `Unhandled error in GAIA task`,
101-
auxiliary: {
102-
error: {
103-
value: error instanceof Error ? error.message : String(error),
104-
type: "string",
105-
},
106-
trace: {
107-
value: error instanceof Error && error.stack ? error.stack : "",
108-
type: "string",
109-
},
110-
},
111-
});
112-
return {
113-
_success: false,
114-
error,
115-
debugUrl,
116-
sessionUrl,
117-
logs: logger.getLogs(),
118-
};
110+
// Let the error propagate - the parent runner will handle cleanup
111+
console.error(error);
112+
throw error;
119113
}
120114
};

evals/tasks/agent/onlineMind2Web.ts

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,21 @@ import { loadApiKeyFromEnv } from "@/lib/utils";
66
import dotenv from "dotenv";
77

88
dotenv.config();
9-
9+
/**
10+
* Data-driven OnlineMind2Web agent eval
11+
* - Expects per-test params injected via eval runner: { task_id, confirmed_task, website, reference_length, level }
12+
* - Starts at `website`, runs the agent with `confirmed_task` as instruction
13+
* - Requires the agent to output a final answer in the form: "Final Answer: <value>"
14+
* - Marks success if such an answer string is present (exact matching against dataset can be layered later)
15+
* - Uses the evaluator to determine if the agent successfully completed the task
16+
*/
1017
export const onlineMind2Web: EvalFunction = async ({
1118
stagehand,
1219
logger,
1320
debugUrl,
1421
sessionUrl,
15-
modelName,
1622
input,
23+
modelName,
1724
}) => {
1825
const startTime = Date.now();
1926

@@ -66,10 +73,10 @@ export const onlineMind2Web: EvalFunction = async ({
6673

6774
screenshotCollector.start();
6875

69-
const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 5;
76+
const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 50;
7077
const agentResult = await agent.execute({
7178
instruction: params.confirmed_task,
72-
maxSteps: maxSteps,
79+
maxSteps,
7380
});
7481

7582
logger.log(agentResult);

evals/tasks/agent/osworld.ts

Lines changed: 17 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -101,51 +101,32 @@ export const osworld: EvalFunction = async ({
101101
let screenshots: Buffer[] = [];
102102
let result;
103103

104-
try {
105-
screenshotCollector.start();
104+
screenshotCollector.start();
106105

107-
// Execute the task using the pre-initialized agent with timeout
108-
const executionPromise = agent.execute({
109-
instruction: params.instruction,
110-
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
111-
});
106+
// Execute the task using the pre-initialized agent with timeout
107+
const executionPromise = agent.execute({
108+
instruction: params.instruction,
109+
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
110+
});
112111

113-
// Apply timeout wrapper
114-
const timeoutPromise = new Promise((_, reject) =>
115-
setTimeout(
116-
() => reject(new Error(`Task timed out after ${timeout}ms`)),
117-
timeout,
118-
),
119-
);
120-
121-
result = await Promise.race([executionPromise, timeoutPromise]);
122-
} finally {
123-
// Always stop collecting and get all screenshots, even on error
124-
screenshots = screenshotCollector.stop();
125-
}
112+
// Apply timeout wrapper
113+
const timeoutPromise = new Promise((_, reject) =>
114+
setTimeout(
115+
() => reject(new Error(`Task timed out after ${timeout}ms`)),
116+
timeout,
117+
),
118+
);
119+
120+
result = await Promise.race([executionPromise, timeoutPromise]);
121+
// Stop collecting and get all screenshots (not reached if execution throws or times out)
122+
screenshots = screenshotCollector.stop();
126123

127124
logger.log({
128125
category: "evaluation",
129126
message: `Collected ${screenshots.length} screenshots for evaluation`,
130127
level: 1,
131128
});
132129

133-
logger.log({
134-
category: "osworld",
135-
message: `Task ${params.id} execution completed`,
136-
level: 1,
137-
auxiliary: {
138-
task_id: {
139-
value: params.id,
140-
type: "string",
141-
},
142-
has_result: {
143-
value: (!!result).toString(),
144-
type: "string",
145-
},
146-
},
147-
});
148-
149130
// Evaluate based on OSWorld evaluation type
150131
const success = await evaluateOSWorldTask(stagehand, params, logger);
151132

evals/tasks/agent/webbench.ts

Lines changed: 15 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ export const webbench: EvalFunction = async ({
1010
input,
1111
agent,
1212
}) => {
13+
const startTime = Date.now();
14+
1315
try {
1416
const params = ((input && input.params) || {}) as {
1517
id?: string;
@@ -49,45 +51,15 @@ export const webbench: EvalFunction = async ({
4951
agent.setScreenshotCollector(screenshotCollector);
5052
}
5153

52-
let screenshots: Buffer[] = [];
53-
let result;
54-
55-
try {
56-
screenshotCollector.start();
57-
58-
logger.log({
59-
category: "webbench",
60-
message: `Starting WebBench task ${params.id}`,
61-
level: 1,
62-
auxiliary: {
63-
category: {
64-
value: params.category || "unknown",
65-
type: "string",
66-
},
67-
difficulty: {
68-
value: params.difficulty || "unknown",
69-
type: "string",
70-
},
71-
url: {
72-
value: params.url,
73-
type: "string",
74-
},
75-
task_preview: {
76-
value: params.task.substring(0, 100) + "...",
77-
type: "string",
78-
},
79-
},
80-
});
54+
screenshotCollector.start();
8155

82-
// Execute the task using the pre-initialized agent
83-
result = await agent.execute({
84-
instruction: params.task,
85-
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
86-
});
87-
} finally {
88-
// Always stop collecting and get all screenshots, even on error
89-
screenshots = screenshotCollector.stop();
90-
}
56+
// Execute the task using the pre-initialized agent
57+
const agentResult = await agent.execute({
58+
instruction: params.task,
59+
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
60+
});
61+
// Stop collecting and get all screenshots (not reached if execution throws)
62+
const screenshots = screenshotCollector.stop();
9163

9264
logger.log({
9365
category: "evaluation",
@@ -106,7 +78,7 @@ export const webbench: EvalFunction = async ({
10678
type: "string",
10779
},
10880
has_result: {
109-
value: (!!result).toString(),
81+
value: (!!agentResult).toString(),
11082
type: "string",
11183
},
11284
},
@@ -136,7 +108,7 @@ export const webbench: EvalFunction = async ({
136108
question: evalPrompt,
137109
screenshot: screenshots,
138110
agentReasoning:
139-
result?.message ||
111+
agentResult.message ||
140112
"no reasoning available, agent potentially hit step limit",
141113
});
142114

@@ -146,6 +118,9 @@ export const webbench: EvalFunction = async ({
146118
task_id: params.id,
147119
category: params.category,
148120
difficulty: params.difficulty || "unknown",
121+
screenshotCount: screenshots.length,
122+
final_answer: agentResult?.message,
123+
execution_time: Date.now() - startTime,
149124
debugUrl,
150125
sessionUrl,
151126
logs: logger.getLogs(),

0 commit comments

Comments
 (0)