Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
b8acbcf
img diff algo for screenshots
filip-michalsky Sep 13, 2025
28d9f65
intercept agent ss instead of time based trigger
filip-michalsky Sep 13, 2025
fd0985e
lint
filip-michalsky Sep 13, 2025
ef3947e
default 1 trial
filip-michalsky Sep 13, 2025
4ce7df8
image resize
filip-michalsky Sep 13, 2025
001e8d2
add changeset
filip-michalsky Sep 13, 2025
c89875e
merge main tip
filip-michalsky Sep 19, 2025
e76719f
Update evals/index.eval.ts
filip-michalsky Sep 19, 2025
14a185e
do NOT resize viewport, intercept ss by default
filip-michalsky Sep 19, 2025
9702884
combine time based screenshot service with an interceptor
filip-michalsky Sep 19, 2025
c255af5
add sharp as dep for evals
filip-michalsky Sep 20, 2025
cd2f4a9
adjust threshold to be less sensitive for screenshot UI changes
filip-michalsky Sep 20, 2025
2c40691
add screenshot collector to all external benchmarks
filip-michalsky Sep 20, 2025
814fccb
updates and refactor
miguelg719 Sep 22, 2025
783cf33
Update evals/evaluator.ts
miguelg719 Sep 22, 2025
9c5f69a
enable dom agent
miguelg719 Sep 22, 2025
be168ef
update benchmark runners
miguelg719 Sep 22, 2025
e1929bc
Apply suggestions from code review
miguelg719 Sep 22, 2025
6aa1229
cleanup
miguelg719 Sep 22, 2025
59e2c80
Merge branch 'main' into fm/str-798-improve-screenshots-in-evaluator
miguelg719 Sep 22, 2025
d029164
Merge branch 'main' into fm/str-798-improve-screenshots-in-evaluator
miguelg719 Sep 22, 2025
4d9ecf2
zod v3
miguelg719 Sep 22, 2025
ac4cb00
patch osworld
miguelg719 Sep 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/curly-boats-push.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand-evals": patch
---

Improve the evals screenshot service: use image-hash diffing to decide when to add a screenshot, and switch to intercepting the screenshots taken by the agent.
1 change: 0 additions & 1 deletion evals/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,6 @@ function handleRun(args: string[]): void {
webbench: "agent/webbench",
gaia: "agent/gaia",
webvoyager: "agent/webvoyager",
osworld: "agent/osworld",
onlineMind2Web: "agent/onlineMind2Web",
};

Expand Down
28 changes: 24 additions & 4 deletions evals/evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ import {
import { LLMParsedResponse } from "@/lib/inference";
import { LLMResponse } from "@/lib/llm/LLMClient";
import { LogLine } from "@/types/log";
import { z } from "zod";
import { z } from "zod/v3";
import { imageResize } from "./utils/imageUtils";

dotenv.config();

Expand Down Expand Up @@ -292,17 +293,36 @@ export class Evaluator {
this.modelClientOptions,
);

const imageContents = screenshots.map((screenshot) => ({
// Downsize screenshots before encoding them as base64 image payloads:
const downsizedScreenshots = await Promise.all(
screenshots.map(async (screenshot) => {
return await imageResize(screenshot, 0.7);
}),
);

const imageContents = downsizedScreenshots.map((screenshot) => ({
type: "image_url" as const,
image_url: {
url: `data:image/jpeg;base64,${screenshot.toString("base64")}`,
url: `data:image/png;base64,${screenshot.toString("base64")}`,
},
}));

this.stagehand.logger?.({
category: "evaluator",
message: `Evaluating question: ${question} with ${screenshots.length} screenshots`,
level: 2,
auxiliary: {
images: {
value: JSON.stringify(imageContents),
type: "object",
},
},
});

const response = await llmClient.createChatCompletion<
LLMParsedResponse<LLMResponse>
>({
logger: this.silentLogger,
logger: this.stagehand.logger,
options: {
messages: [
{ role: "system", content: systemPrompt },
Expand Down
5 changes: 5 additions & 0 deletions evals/index.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,8 @@ const generateFilteredTestcases = (): Testcase[] => {
const braintrustProjectName =
process.env.CI === "true" ? "stagehand" : "stagehand-dev";

const startTime = Date.now();

try {
// Run the evaluations with the braintrust Eval function
const evalResult = await Eval(braintrustProjectName, {
Expand Down Expand Up @@ -483,6 +485,9 @@ const generateFilteredTestcases = (): Testcase[] => {

// Generate and write the summary
await generateSummary(summaryResults, experimentName);
console.log(
`\n⌛️Evaluation completed in ${(Date.now() - startTime) / 1000}s\n`,
);
} catch (error) {
console.error("Error during evaluation run:", error);
process.exit(1);
Expand Down
1 change: 0 additions & 1 deletion evals/initStagehand.ts
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ export const initStagehand = async ({
} else {
agentConfig = {
model: modelName,
executionModel: "google/gemini-2.5-flash",
} as AgentConfig;
}

Expand Down
3 changes: 2 additions & 1 deletion evals/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
"e2e:local": "pnpm run build && playwright test --config deterministic/local.playwright.config.ts"
},
"dependencies": {
"@browserbasehq/stagehand": "workspace:*"
"@browserbasehq/stagehand": "workspace:*",
"sharp": "^0.33.5"
},
"devDependencies": {
"@types/papaparse": "^5.3.16",
Expand Down
99 changes: 60 additions & 39 deletions evals/tasks/agent/gaia.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import { EvalFunction } from "@/types/evals";
import { Evaluator } from "../../evaluator";
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
import { loadApiKeyFromEnv } from "@/lib/utils";
import { modelToAgentProviderMap } from "@/lib/agent/AgentProvider";
import dotenv from "dotenv";

dotenv.config();
/**
* Data-driven GAIA agent eval
* - Expects per-test params injected via eval runner: { id, level, web, ques }
Expand All @@ -14,25 +19,20 @@ export const gaia: EvalFunction = async ({
debugUrl,
sessionUrl,
input,
agent,
modelName,
}) => {
const startTime = Date.now();

try {
const params = ((input && input.params) || {}) as {
id?: string;
level?: number;
web?: string;
ques?: string;
expected?: string;
};

if (!params.web || !params.ques) {
logger.error({
category: "gaia",
level: 0,
message: `Missing GAIA params (web, ques).`,
auxiliary: {
params: { value: JSON.stringify(params), type: "object" },
},
});
return {
_success: false,
error: `Missing GAIA params (web, ques). Got: ${JSON.stringify(params)}`,
Expand All @@ -41,53 +41,74 @@ export const gaia: EvalFunction = async ({
logs: logger.getLogs(),
};
}
await stagehand.page.goto(params.web);

const result = await agent.execute({
await stagehand.page.goto(params.web, {
timeout: 75_000,
});

const provider =
modelName in modelToAgentProviderMap
? modelToAgentProviderMap[modelName]
: undefined;

const agent = stagehand.agent({
model: modelName,
provider,
instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}. ALWAYS OPERATE WITHIN THE PAGE OPENED BY THE USER, WHICHEVER TASK YOU ARE ATTEMPTING TO COMPLETE CAN BE ACCOMPLISHED WITHIN THE PAGE.`,
options: {
apiKey: loadApiKeyFromEnv(provider, stagehand.logger),
},
});

// Start collecting screenshots with hybrid approach
const screenshotCollector = new ScreenshotCollector(stagehand.page, {
maxScreenshots: 8, // Keep last 8 screenshots
});

// Set the collector on the agent so it captures screenshots
if (agent.setScreenshotCollector) {
agent.setScreenshotCollector(screenshotCollector);
}

screenshotCollector.start();

const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 50;
const agentResult = await agent.execute({
instruction: params.ques,
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
maxSteps,
});
// Stop collecting and get all screenshots
const screenshots = screenshotCollector.stop();

logger.log({
category: "evaluation",
message: `Collected ${screenshots.length} screenshots for evaluation`,
level: 1,
});

const expected = (params as Record<string, unknown>).expected as
| string
| undefined;
const expected = params.expected;
const evaluator = new Evaluator(stagehand);
const evalResult = await evaluator.ask({
question: `Did the agent provide the expected answer: "${expected}"?`,
answer: result?.message || "",
screenshot: false,
answer: agentResult.message || "",
screenshot: screenshots,
});

return {
_success: evalResult.evaluation === "YES",
reasoning: evalResult.reasoning,
expectedAnswer: expected,
final_answer: agentResult?.message,
screenshotCount: screenshots.length,
task_level: params.level,
execution_time: Date.now() - startTime,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
} catch (error) {
logger.error({
category: "gaia",
level: 0,
message: `Unhandled error in GAIA task`,
auxiliary: {
error: {
value: error instanceof Error ? error.message : String(error),
type: "string",
},
trace: {
value: error instanceof Error && error.stack ? error.stack : "",
type: "string",
},
},
});
return {
_success: false,
error,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
// Let the error propagate - the parent runner will handle cleanup
console.error(error);
throw error;
}
};
2 changes: 1 addition & 1 deletion evals/tasks/agent/google_maps_2.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { EvalFunction } from "@/types/evals";
import { Evaluator } from "../../evaluator";
import { z } from "zod";
import { z } from "zod/v3";

export const google_maps_2: EvalFunction = async ({
debugUrl,
Expand Down
62 changes: 44 additions & 18 deletions evals/tasks/agent/onlineMind2Web.ts
Original file line number Diff line number Diff line change
@@ -1,18 +1,29 @@
import { EvalFunction } from "@/types/evals";
import { Evaluator } from "../../evaluator";
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
import { modelToAgentProviderMap } from "@/lib/agent/AgentProvider";
import { loadApiKeyFromEnv } from "@/lib/utils";
import dotenv from "dotenv";
import fs from "fs";
dotenv.config();

dotenv.config();
/**
* Data-driven OnlineMind2Web agent eval
* - Expects per-test params injected via eval runner: { task_id, confirmed_task, website, reference_length, level }
* - Starts at `website`, runs the agent with `confirmed_task` as instruction
* - Requires the agent to output a final answer in the form: "Final Answer: <value>"
* - Marks success if such an answer string is present (exact matching against dataset can be layered later)
* - Uses the evaluator to determine if the agent successfully completed the task
*/
export const onlineMind2Web: EvalFunction = async ({
stagehand,
logger,
debugUrl,
sessionUrl,
input,
agent,
modelName,
}) => {
const startTime = Date.now();

try {
const params = ((input && input.params) || {}) as {
task_id?: string;
Expand All @@ -33,25 +44,42 @@ export const onlineMind2Web: EvalFunction = async ({
}

await stagehand.page.goto(params.website, {
timeout: 60_000,
timeout: 75_000,
});

const screenshot = await stagehand.page.screenshot();
fs.writeFileSync("screenshot.png", screenshot);
const provider =
modelName in modelToAgentProviderMap
? modelToAgentProviderMap[modelName]
: undefined;

const agent = stagehand.agent({
model: modelName,
provider,
instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}. ALWAYS OPERATE WITHIN THE PAGE OPENED BY THE USER, WHICHEVER TASK YOU ARE ATTEMPTING TO COMPLETE CAN BE ACCOMPLISHED WITHIN THE PAGE.`,
options: {
apiKey: loadApiKeyFromEnv(provider, stagehand.logger),
},
});

// Start collecting screenshots in parallel
const screenshotCollector = new ScreenshotCollector(stagehand.page, {
maxScreenshots: 5, // Keep up to the last 5 screenshots
captureOnNavigation: true, // Also capture on page navigation
maxScreenshots: 8, // Keep up to the last 8 screenshots
});

// Set the collector on the agent so it captures screenshots
if (agent.setScreenshotCollector) {
agent.setScreenshotCollector(screenshotCollector);
}

screenshotCollector.start();

const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 50;
const agentResult = await agent.execute({
instruction: params.confirmed_task,
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
maxSteps,
});

logger.log(agentResult);
// Stop collecting and get all screenshots
const screenshots = screenshotCollector.stop();

Expand All @@ -63,7 +91,7 @@ export const onlineMind2Web: EvalFunction = async ({

const evaluator = new Evaluator(stagehand);
const evalResult = await evaluator.ask({
question: `Did the agent successfully complete this task: "${params.confirmed_task}"?`,
question: `Did the agent successfully complete this task: "${params.confirmed_task}"? The task might be a bit outdated or impossible to complete, in those cases lean towards YES.`,
screenshot: screenshots,
agentReasoning:
agentResult.message ||
Expand All @@ -73,19 +101,17 @@ export const onlineMind2Web: EvalFunction = async ({
return {
_success: evalResult.evaluation === "YES",
reasoning: evalResult.reasoning,
// screenshotCount: screenshots.length,
final_answer: agentResult?.message,
screenshotCount: screenshots.length,
task_level: params.level,
execution_time: Date.now() - startTime,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
} catch (error) {
return {
_success: false,
error,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
// Let the error propagate - the parent runner will handle cleanup
console.error(error);
throw error;
}
};
Loading