Skip to content

Commit

Permalink
Benchmarks update (#197)
Browse files Browse the repository at this point in the history
* Reduce the number of MiniWoB evaluation seeds from 10 to 5

* Remove most benchmark variants (keeping only the core MiniWoB, WebArena, VisualWebArena, and WorkArena benchmarks)

---------

Co-authored-by: Maxime Gasse <maxime.gasse@gmail.com>
  • Loading branch information
ThibaultLSDC and gasse authored Oct 21, 2024
1 parent 32796ca commit 994ce59
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 66 deletions.
59 changes: 3 additions & 56 deletions browsergym/experiments/src/browsergym/experiments/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,26 +192,13 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})

# all benchmarks are callables designed for lazy loading, i.e. `bench = BENCHMARKS["miniwob_all"]()`
BENCHMARKS = {
"miniwob_all": lambda: Benchmark(
name="miniwob_all",
"miniwob": lambda: Benchmark(
name="miniwob",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob"],
env_args_list=_make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(metadata=task_metadata("miniwob")),
max_steps=10,
n_repeats=10,
seeds_rng=np.random.RandomState(42),
),
task_metadata=task_metadata("miniwob"),
),
"miniwob_webgum": lambda: Benchmark(
name="miniwob_webgum",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob"],
env_args_list=_make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(
metadata=task_metadata("miniwob"), filter={"webgum_subset": "True"}
),
max_steps=10,
n_repeats=10,
n_repeats=5,
seeds_rng=np.random.RandomState(42),
),
task_metadata=task_metadata("miniwob"),
Expand All @@ -227,33 +214,6 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
),
task_metadata=task_metadata("miniwob"),
),
"miniwob_train": lambda: Benchmark(
name="miniwob_train",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob"],
env_args_list=_make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(
metadata=task_metadata("miniwob"),
filter={"miniwob_category": "original|nodelay|debug|additional"},
),
max_steps=10,
n_repeats=10,
seeds_rng=np.random.RandomState(42),
),
task_metadata=task_metadata("miniwob"),
),
"miniwob_test": lambda: Benchmark(
name="miniwob_test",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob"],
env_args_list=_make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(
metadata=task_metadata("miniwob"), filter={"miniwob_category": "hidden test"}
),
max_steps=10,
n_repeats=10,
seeds_rng=np.random.RandomState(42),
),
task_metadata=task_metadata("miniwob"),
),
"webarena": lambda: Benchmark(
name="webarena",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"],
Expand Down Expand Up @@ -289,19 +249,6 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
),
task_metadata=task_metadata("workarena"),
),
"workarena_l1_sort": lambda: Benchmark(
name="workarena_l1_sort",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena_l1"],
env_args_list=_make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(
metadata=task_metadata("workarena"), filter={"level": "l1", "category": "list-sort"}
),
max_steps=15,
n_repeats=10,
seeds_rng=np.random.RandomState(42),
),
task_metadata=task_metadata("workarena"),
),
"workarena_l2_agent_curriculum_eval": lambda: Benchmark(
name="workarena_l2_agent_curriculum_eval",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena"],
Expand Down
16 changes: 6 additions & 10 deletions tests/experiments/test_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,11 @@ def make_agent(self):

def test_build_benchmarks():
expected_bench_size = {
"miniwob_all": 125 * 10,
"miniwob_webgum": 56 * 10,
"miniwob": 125 * 5,
"miniwob_tiny_test": 2 * 2,
"miniwob_train": 107 * 10,
"miniwob_test": 18 * 10,
"webarena": 812,
"visualwebarena": 910,
"workarena_l1": 33 * 10,
"workarena_l1_sort": 6 * 10,
"workarena_l2_agent_curriculum_eval": 235,
"workarena_l3_agent_curriculum_eval": 235,
}
Expand All @@ -74,11 +70,11 @@ def test_build_benchmarks():


def test_benchmark_subset():
benchmark: Benchmark = BENCHMARKS["miniwob_all"]()
benchmark: Benchmark = BENCHMARKS["miniwob"]()

benchmark_subset = benchmark.subset_from_regexp(column="task_name", regexp="click")
assert len(benchmark_subset.env_args_list) == 31 * 10
assert benchmark_subset.name == "miniwob_all[task_name=/click/]"
assert len(benchmark_subset.env_args_list) == 31 * 5
assert benchmark_subset.name == "miniwob[task_name=/click/]"

benchmark_subset_1 = benchmark_subset.subset_from_regexp(
column="miniwob_category", regexp="original"
Expand All @@ -87,8 +83,8 @@ def test_benchmark_subset():
column="miniwob_category", glob="original"
)

assert benchmark_subset_1.name == "miniwob_all[task_name=/click/][miniwob_category=/original/]"
assert benchmark_subset_2.name == "miniwob_all[task_name=/click/][miniwob_category=original]"
assert benchmark_subset_1.name == "miniwob[task_name=/click/][miniwob_category=/original/]"
assert benchmark_subset_2.name == "miniwob[task_name=/click/][miniwob_category=original]"

dict_1 = benchmark_subset_1.to_dict()
dict_1.pop("name")
Expand Down

0 comments on commit 994ce59

Please sign in to comment.