Skip to content

Commit 9c5f69a

Browse files
committed
enable dom agent
1 parent 783cf33 commit 9c5f69a

File tree

3 files changed

+6 -18 lines changed

3 files changed

+6 -18 lines changed

.env.example

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,3 @@ EXPERIMENTAL_EVAL_MODELS="gpt-4o,claude-3-5-sonnet-latest,o1-mini,o1-preview"
 11  11   EVAL_CATEGORIES="observe,act,combination,extract,experimental"
 12  12   AGENT_EVAL_MAX_STEPS=50
 13  13   STAGEHAND_API_URL="http://localhost:80"
 14     -
 15     - # Screenshot collector thresholds for determining when screenshots are too similar
 16     - # SSIM (Structural Similarity Index) threshold: 0-1, higher = more similar (default: 0.85)
 17     - SCREENSHOT_SSIM_THRESHOLD=0.85
 18     - # MSE (Mean Squared Error) threshold: lower = more similar (default: 200)
 19     - SCREENSHOT_MSE_THRESHOLD=200

evals/taskConfig.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS
109 109   : [
110 110   "computer-use-preview-2025-03-11",
111 111   "claude-sonnet-4-20250514",
112     - // "anthropic/claude-sonnet-4-20250514",
    112 + "anthropic/claude-sonnet-4-20250514",
113 113   ];
114 114
115 115   /**

evals/tasks/agent/onlineMind2Web.ts

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,11 @@ export const onlineMind2Web: EvalFunction = async ({
 40  40   timeout: 75_000,
 41  41   });
 42  42
 43     - if (!(modelName in modelToAgentProviderMap)) {
 44     - return {
 45     - _success: false,
 46     - error: `Model ${modelName} is not supported for agent tasks. Supported models: ${Object.keys(modelToAgentProviderMap).join(", ")}`,
 47     - debugUrl,
 48     - sessionUrl,
 49     - logs: logger.getLogs(),
 50     - };
 51     - }
     43 + const provider =
     44 + modelName in modelToAgentProviderMap
     45 + ? modelToAgentProviderMap[modelName]
     46 + : undefined;
 52  47
 53     - const provider = modelToAgentProviderMap[modelName];
 54  48   const agent = stagehand.agent({
 55  49   model: modelName,
 56  50   provider,
@@ -72,7 +66,7 @@ export const onlineMind2Web: EvalFunction = async ({
 72  66
 73  67   screenshotCollector.start();
 74  68
 75     - const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 50;
     69 + const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 5;
 76  70   const agentResult = await agent.execute({
 77  71   instruction: params.confirmed_task,
 78  72   maxSteps: maxSteps,

0 commit comments

Comments (0)