From 2d1be91b66182673156de9f7d8dcfbca17f62a5c Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Thu, 21 Nov 2024 12:27:41 -0500 Subject: [PATCH 1/2] visualwebarena_tiny benchmark --- .../experiments/benchmark/configs.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py index b6926557..c9c5994e 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py @@ -153,6 +153,24 @@ ), task_metadata=task_metadata("webarena"), ), + "visualwebarena_tiny": lambda: Benchmark( + name="visualwebarena_tiny", + high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["visualwebarena"], + is_multi_tab=True, + supports_parallel_seeds=False, + backends=["visualwebarena"], + env_args_list=make_env_args_list_from_fixed_seeds( + task_list=[ + "visualwebarena.228", + "visualwebarena.263", + "visualwebarena.550", + "visualwebarena.784", + ], + max_steps=30, + fixed_seeds=[0], + ), + task_metadata=task_metadata("visualwebarena"), + ), "visualwebarena": lambda: Benchmark( name="visualwebarena", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["visualwebarena"], From fe3ece218cd20810d852c26e6079c3de04c8b01a Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Thu, 21 Nov 2024 14:50:13 -0500 Subject: [PATCH 2/2] test fix --- tests/experiments/test_benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/experiments/test_benchmark.py b/tests/experiments/test_benchmark.py index f43a661b..02f6f62b 100644 --- a/tests/experiments/test_benchmark.py +++ b/tests/experiments/test_benchmark.py @@ -50,6 +50,7 @@ def test_build_benchmarks(): "webarena": 812, "webarena_tiny": 6, "visualwebarena": 910, + "visualwebarena_tiny": 4, "workarena_l1": 33 * 10, "workarena_l2_agent_curriculum_eval": 235, "workarena_l3_agent_curriculum_eval": 235,