From fe219bb076a1b9725e555d88b9b6c74c97046163 Mon Sep 17 00:00:00 2001
From: Maxime Gasse
Date: Wed, 23 Oct 2024 10:35:01 -0400
Subject: [PATCH] benchmark code split refactor

---
 browsergym/experiments/src/bgym/__init__.py   |   6 +-
 .../experiments/benchmark/__init__.py         |   1 +
 .../{benchmark.py => benchmark/base.py}       | 120 ++++--------
 .../metadata}/assistantbench.csv              |   0
 .../metadata}/miniwob.csv                     |   0
 .../metadata}/scripts.py                      |   0
 .../experiments/benchmark/metadata/utils.py   |  24 ++++
 .../metadata}/visualwebarena.csv              |   0
 .../metadata}/webarena.csv                    |   0
 .../metadata}/weblinx.csv                     |   0
 .../metadata}/workarena.csv                   |   0
 .../browsergym/experiments/benchmark/utils.py |  95 ++++++++++++++
 tests/assistantbench/test_evaluation.py       |   2 +-
 tests/experiments/test_benchmark.py           |  20 +--
 14 files changed, 159 insertions(+), 109 deletions(-)
 create mode 100644 browsergym/experiments/src/browsergym/experiments/benchmark/__init__.py
 rename browsergym/experiments/src/browsergym/experiments/{benchmark.py => benchmark/base.py} (75%)
 rename browsergym/experiments/src/browsergym/experiments/{task_metadata => benchmark/metadata}/assistantbench.csv (100%)
 rename browsergym/experiments/src/browsergym/experiments/{task_metadata => benchmark/metadata}/miniwob.csv (100%)
 rename browsergym/experiments/src/browsergym/experiments/{task_metadata => benchmark/metadata}/scripts.py (100%)
 create mode 100644 browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py
 rename browsergym/experiments/src/browsergym/experiments/{task_metadata => benchmark/metadata}/visualwebarena.csv (100%)
 rename browsergym/experiments/src/browsergym/experiments/{task_metadata => benchmark/metadata}/webarena.csv (100%)
 rename browsergym/experiments/src/browsergym/experiments/{task_metadata => benchmark/metadata}/weblinx.csv (100%)
 rename browsergym/experiments/src/browsergym/experiments/{task_metadata => benchmark/metadata}/workarena.csv (100%)
 create mode 100644 browsergym/experiments/src/browsergym/experiments/benchmark/utils.py

diff --git a/browsergym/experiments/src/bgym/__init__.py b/browsergym/experiments/src/bgym/__init__.py
index 10adf6f9..c43f505f 100644
--- a/browsergym/experiments/src/bgym/__init__.py
+++ b/browsergym/experiments/src/bgym/__init__.py
@@ -2,7 +2,11 @@
 from browsergym.core.action.highlevel import HighLevelActionSet
 from browsergym.core.action.python import PythonActionSet
 from browsergym.experiments.agent import Agent, AgentInfo
-from browsergym.experiments.benchmark import Benchmark, HighLevelActionSetArgs, BENCHMARKS
+from browsergym.experiments.benchmark import (
+    DEFAULT_BENCHMARKS,
+    Benchmark,
+    HighLevelActionSetArgs,
+)
 from browsergym.experiments.loop import (
     AbstractAgentArgs,
     EnvArgs,
diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/__init__.py b/browsergym/experiments/src/browsergym/experiments/benchmark/__init__.py
new file mode 100644
index 00000000..e00ad594
--- /dev/null
+++ b/browsergym/experiments/src/browsergym/experiments/benchmark/__init__.py
@@ -0,0 +1 @@
+from .base import DEFAULT_BENCHMARKS, Benchmark, HighLevelActionSetArgs
diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark.py b/browsergym/experiments/src/browsergym/experiments/benchmark/base.py
similarity index 75%
rename from browsergym/experiments/src/browsergym/experiments/benchmark.py
rename to browsergym/experiments/src/browsergym/experiments/benchmark/base.py
index 94160359..f59a90b3 100644
--- a/browsergym/experiments/src/browsergym/experiments/benchmark.py
+++ b/browsergym/experiments/src/browsergym/experiments/benchmark/base.py
@@ -1,7 +1,5 @@
 import fnmatch
-import io
 import logging
-import pkgutil
 from dataclasses import dataclass, field
 from typing import Literal, Optional
 
@@ -10,7 +8,13 @@
 from dataclasses_json import DataClassJsonMixin, config
 
 from browsergym.core.action.highlevel import HighLevelActionSet
-from browsergym.experiments.loop import SEED_MAX, EnvArgs
+from browsergym.experiments.loop import EnvArgs
+
+from .metadata.utils import task_list_from_metadata, task_metadata
+from .utils import (
+    make_env_args_list_from_repeat_tasks,
+    make_env_args_list_from_workarena_curriculum,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -49,6 +53,7 @@ def make_action_set(self):
 class Benchmark(DataClassJsonMixin):
     name: str
     high_level_action_set_args: HighLevelActionSetArgs
+    is_multi_tab: bool
     env_args_list: list[EnvArgs]
     task_metadata: Optional[pd.DataFrame] = field(
         default_factory=lambda: None,
@@ -109,28 +114,6 @@ def subset_from_regexp(self, column, regexp):
         )
 
 
-def task_metadata(benchmark_name: str):
-    return task_metadata_from_csv(
-        io.StringIO(
-            pkgutil.get_data(__name__, f"task_metadata/{benchmark_name}.csv").decode("utf-8")
-        )
-    )
-
-
-def task_metadata_from_csv(filepath):
-    return pd.read_csv(filepath).fillna("")
-
-
-def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {}):
-    df = metadata
-    # filter the desired columns (AND filter)
-    for col_name, regex in filter.items():
-        col_filter = df[col_name].astype(str).str.contains(regex, regex=True)
-        df = df[col_filter]
-    # return only the task names
-    return list(df["task_name"])
-
-
 # These are mean as the default highlevel action set to fairly evaluate agents on each benchmark.
 # They are mostly arbitrary, the important thing is to evaluate different agents using the same action set for fairness.
 DEFAULT_HIGHLEVEL_ACTION_SET_ARGS = {
@@ -197,12 +180,13 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
     ),
 }
 
-# all benchmarks are callables designed for lazy loading, i.e. `bench = BENCHMARKS["miniwob_all"]()`
-BENCHMARKS = {
+# all benchmarks are callables designed for lazy loading, i.e. `bench = DEFAULT_BENCHMARKS["miniwob"]()`
+DEFAULT_BENCHMARKS = {
     "miniwob": lambda: Benchmark(
         name="miniwob",
         high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob"],
-        env_args_list=_make_env_args_list_from_repeat_tasks(
+        is_multi_tab=False,
+        env_args_list=make_env_args_list_from_repeat_tasks(
             task_list=task_list_from_metadata(metadata=task_metadata("miniwob")),
             max_steps=10,
             n_repeats=5,
@@ -213,7 +197,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
     "miniwob_tiny_test": lambda: Benchmark(
         name="miniwob_tiny_test",
         high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob"],
-        env_args_list=_make_env_args_list_from_repeat_tasks(
+        is_multi_tab=False,
+        env_args_list=make_env_args_list_from_repeat_tasks(
             task_list=["miniwob.click-dialog", "miniwob.click-checkboxes"],
             max_steps=5,
             n_repeats=2,
@@ -224,7 +209,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
     "webarena": lambda: Benchmark(
         name="webarena",
         high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"],
-        env_args_list=_make_env_args_list_from_repeat_tasks(
+        is_multi_tab=True,
+        env_args_list=make_env_args_list_from_repeat_tasks(
             task_list=task_list_from_metadata(metadata=task_metadata("webarena")),
             max_steps=15,
             n_repeats=1,
@@ -235,7 +221,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
     "visualwebarena": lambda: Benchmark(
         name="visualwebarena",
         high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["visualwebarena"],
-        env_args_list=_make_env_args_list_from_repeat_tasks(
+        is_multi_tab=True,
+        env_args_list=make_env_args_list_from_repeat_tasks(
             task_list=task_list_from_metadata(metadata=task_metadata("visualwebarena")),
             max_steps=15,
             n_repeats=1,
@@ -246,7 +233,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
     "workarena_l1": lambda: Benchmark(
         name="workarena_l1",
         high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena_l1"],
-        env_args_list=_make_env_args_list_from_workarena_curriculum(
+        is_multi_tab=False,
+        env_args_list=make_env_args_list_from_workarena_curriculum(
             level="l1",
             task_category_filter=None,
             meta_seed=42,  # meta seed for evaluation curriculum
@@ -259,7 +247,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
     "workarena_l2_agent_curriculum_eval": lambda: Benchmark(
         name="workarena_l2_agent_curriculum_eval",
         high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena"],
-        env_args_list=_make_env_args_list_from_workarena_curriculum(
+        is_multi_tab=True,
+        env_args_list=make_env_args_list_from_workarena_curriculum(
             level="l2",
             task_category_filter=None,
             meta_seed=42,  # meta seed for evaluation curriculum
@@ -271,7 +260,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
     "workarena_l3_agent_curriculum_eval": lambda: Benchmark(
         name="workarena_l3_agent_curriculum_eval",
         high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena"],
-        env_args_list=_make_env_args_list_from_workarena_curriculum(
+        is_multi_tab=True,
+        env_args_list=make_env_args_list_from_workarena_curriculum(
             level="l3",
             task_category_filter=None,
             meta_seed=42,  # meta seed for evaluation curriculum
@@ -283,7 +273,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
     "assistantbench": lambda: Benchmark(
         name="assistantbench",
         high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["assistantbench"],
-        env_args_list=_make_env_args_list_from_repeat_tasks(
+        is_multi_tab=True,
+        env_args_list=make_env_args_list_from_repeat_tasks(
             task_list=task_list_from_metadata(
                 metadata=task_metadata("assistantbench"), filter={"browsergym_split": "valid|test"}
             ),
@@ -294,62 +285,3 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
         task_metadata=task_metadata("assistantbench"),
     ),
 }
-
-
-def _make_env_args_list_from_workarena_curriculum(
-    level: Literal["l1", "l2", "l3"],
-    task_category_filter: str,
-    meta_seed: int,
-    max_steps: int,
-    curriculum_type: Literal["human", "agent"],
-    seeds_l1: int = 10,
-):
-    """
-    Returns a WorkArena predefined task curriculum (e.g., task and seed combination).
-    """
-    assert level in ("l1", "l2", "l3")
-    assert curriculum_type in ("human", "agent")
-
-    env_args_list = []
-
-    from browsergym.workarena import get_all_tasks_agents
-
-    all_task_tuples = get_all_tasks_agents(
-        filter=f"{level}.{task_category_filter}" if task_category_filter else level,
-        meta_seed=meta_seed,
-        is_agent_curriculum=(curriculum_type == "agent"),
-        n_seed_l1=seeds_l1,
-    )
-
-    for task, seed in all_task_tuples:
-        task_name = task.get_task_id()
-        env_args_list.append(EnvArgs(task_name=task_name, task_seed=seed, max_steps=max_steps))
-
-    return env_args_list
-
-
-def _make_env_args_list_from_repeat_tasks(
-    task_list: list[str], max_steps: int, n_repeats: int, seeds_rng: np.random.RandomState
-):
-    """
-    Generates a list of `len(task_list)` time `n_repeats` environments arguments, using randomly generated seeds.
-    """
-    env_args_list = []
-    for task in task_list:
-        for seed in seeds_rng.randint(low=0, high=SEED_MAX, size=n_repeats):
-            env_args_list.append(
-                EnvArgs(
-                    task_name=task,
-                    task_seed=int(seed),
-                    max_steps=max_steps,
-                    headless=True,
-                    record_video=False,
-                    wait_for_user_message=False,
-                    viewport=None,
-                    slow_mo=None,
-                    storage_state=None,
-                    task_kwargs=None,
-                )
-            )
-
-    return env_args_list
diff --git a/browsergym/experiments/src/browsergym/experiments/task_metadata/assistantbench.csv b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/assistantbench.csv
similarity index 100%
rename from browsergym/experiments/src/browsergym/experiments/task_metadata/assistantbench.csv
rename to browsergym/experiments/src/browsergym/experiments/benchmark/metadata/assistantbench.csv
diff --git a/browsergym/experiments/src/browsergym/experiments/task_metadata/miniwob.csv b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/miniwob.csv
similarity index 100%
rename from browsergym/experiments/src/browsergym/experiments/task_metadata/miniwob.csv
rename to browsergym/experiments/src/browsergym/experiments/benchmark/metadata/miniwob.csv
diff --git a/browsergym/experiments/src/browsergym/experiments/task_metadata/scripts.py b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/scripts.py
similarity index 100%
rename from browsergym/experiments/src/browsergym/experiments/task_metadata/scripts.py
rename to browsergym/experiments/src/browsergym/experiments/benchmark/metadata/scripts.py
diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py
new file mode 100644
index 00000000..941896ed
--- /dev/null
+++ b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py
@@ -0,0 +1,24 @@
+import io
+import pkgutil
+
+import pandas as pd
+
+
+def task_metadata(benchmark_name: str):
+    return task_metadata_from_csv(
+        io.StringIO(pkgutil.get_data(__name__, f"{benchmark_name}.csv").decode("utf-8"))
+    )
+
+
+def task_metadata_from_csv(filepath):
+    return pd.read_csv(filepath).fillna("")
+
+
+def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {}):
+    df = metadata
+    # filter the desired columns (AND filter)
+    for col_name, regex in filter.items():
+        col_filter = df[col_name].astype(str).str.contains(regex, regex=True)
+        df = df[col_filter]
+    # return only the task names
+    return list(df["task_name"])
diff --git a/browsergym/experiments/src/browsergym/experiments/task_metadata/visualwebarena.csv b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/visualwebarena.csv
similarity index 100%
rename from browsergym/experiments/src/browsergym/experiments/task_metadata/visualwebarena.csv
rename to browsergym/experiments/src/browsergym/experiments/benchmark/metadata/visualwebarena.csv
diff --git a/browsergym/experiments/src/browsergym/experiments/task_metadata/webarena.csv b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena.csv
similarity index 100%
rename from browsergym/experiments/src/browsergym/experiments/task_metadata/webarena.csv
rename to browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena.csv
diff --git a/browsergym/experiments/src/browsergym/experiments/task_metadata/weblinx.csv b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/weblinx.csv
similarity index 100%
rename from browsergym/experiments/src/browsergym/experiments/task_metadata/weblinx.csv
rename to browsergym/experiments/src/browsergym/experiments/benchmark/metadata/weblinx.csv
diff --git a/browsergym/experiments/src/browsergym/experiments/task_metadata/workarena.csv b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/workarena.csv
similarity index 100%
rename from browsergym/experiments/src/browsergym/experiments/task_metadata/workarena.csv
rename to browsergym/experiments/src/browsergym/experiments/benchmark/metadata/workarena.csv
diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py
new file mode 100644
index 00000000..cad55373
--- /dev/null
+++ b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py
@@ -0,0 +1,95 @@
+import io
+import pkgutil
+from typing import Literal
+
+import numpy as np
+import pandas as pd
+
+from browsergym.experiments.loop import SEED_MAX, EnvArgs
+
+
+def make_env_args_list_from_workarena_curriculum(
+    level: Literal["l1", "l2", "l3"],
+    task_category_filter: str,
+    meta_seed: int,
+    max_steps: int,
+    curriculum_type: Literal["human", "agent"],
+    seeds_l1: int = 10,
+):
+    """
+    Returns a WorkArena predefined task curriculum (i.e., a list of task and seed combinations).
+    """
+    assert level in ("l1", "l2", "l3")
+    assert curriculum_type in ("human", "agent")
+
+    env_args_list = []
+
+    # dynamic import
+    from browsergym.workarena import get_all_tasks_agents
+
+    all_task_tuples = get_all_tasks_agents(
+        filter=f"{level}.{task_category_filter}" if task_category_filter else level,
+        meta_seed=meta_seed,
+        is_agent_curriculum=(curriculum_type == "agent"),
+        n_seed_l1=seeds_l1,
+    )
+
+    for task, seed in all_task_tuples:
+        task_name = task.get_task_id()
+        env_args_list.append(EnvArgs(task_name=task_name, task_seed=seed, max_steps=max_steps))
+
+    return env_args_list
+
+
+def make_env_args_list_from_repeat_tasks(
+    task_list: list[str], max_steps: int, n_repeats: int, seeds_rng: np.random.RandomState
+):
+    """
+    Generates a list of `len(task_list)` times `n_repeats` environment arguments, using randomly generated seeds.
+    """
+    env_args_list = []
+    for task in task_list:
+        for seed in seeds_rng.randint(low=0, high=SEED_MAX, size=n_repeats):
+            env_args_list.append(
+                EnvArgs(
+                    task_name=task,
+                    task_seed=int(seed),
+                    max_steps=max_steps,
+                    headless=True,
+                    record_video=False,
+                    wait_for_user_message=False,
+                    viewport=None,
+                    slow_mo=None,
+                    storage_state=None,
+                    task_kwargs=None,
+                )
+            )
+
+    return env_args_list
+
+
+def make_env_args_list_from_fixed_seeds(
+    task_list: list[str], max_steps: int, fixed_seeds: list[int]
+):
+    """
+    Generates a list of `len(task_list)` times `len(fixed_seeds)` environment arguments, using the provided fixed seeds.
+    """
+    env_args_list = []
+    for task in task_list:
+        for seed in fixed_seeds:
+            env_args_list.append(
+                EnvArgs(
+                    task_name=task,
+                    task_seed=int(seed),
+                    max_steps=max_steps,
+                    headless=True,
+                    record_video=False,
+                    wait_for_user_message=False,
+                    viewport=None,
+                    slow_mo=None,
+                    storage_state=None,
+                    task_kwargs=None,
+                )
+            )
+
+    return env_args_list
diff --git a/tests/assistantbench/test_evaluation.py b/tests/assistantbench/test_evaluation.py
index 8fb6d99f..6c857b70 100644
--- a/tests/assistantbench/test_evaluation.py
+++ b/tests/assistantbench/test_evaluation.py
@@ -5,7 +5,7 @@
 import pytest
 
 from browsergym.assistantbench.evaluation.evaluator import question_scorer
-from browsergym.experiments.benchmark import task_list_from_metadata, task_metadata
+from browsergym.experiments.benchmark.base import task_list_from_metadata, task_metadata
 
 __DATA_DIR = pathlib.Path(__file__).resolve().parent / "data"
 
diff --git a/tests/experiments/test_benchmark.py b/tests/experiments/test_benchmark.py
index d9f7b25e..c77d2bc8 100644
--- a/tests/experiments/test_benchmark.py
+++ b/tests/experiments/test_benchmark.py
@@ -7,17 +7,12 @@
 from browsergym.core.action.base import AbstractActionSet
 from browsergym.experiments.agent import Agent
 from browsergym.experiments.benchmark import (
-    BENCHMARKS,
+    DEFAULT_BENCHMARKS,
     Benchmark,
     HighLevelActionSetArgs,
-    _make_env_args_list_from_repeat_tasks,
-)
-from browsergym.experiments.loop import (
-    AbstractAgentArgs,
-    EnvArgs,
-    ExpArgs,
-    get_exp_result,
 )
+from browsergym.experiments.benchmark.utils import make_env_args_list_from_fixed_seeds
+from browsergym.experiments.loop import AbstractAgentArgs, ExpArgs, get_exp_result
 from browsergym.utils.obs import flatten_axtree_to_str
 
 
@@ -60,7 +55,7 @@ def test_build_benchmarks():
         "workarena_l3_agent_curriculum_eval": 235,
         "assistantbench": 214,
     }
-    for name, benchmark_builder in BENCHMARKS.items():
+    for name, benchmark_builder in DEFAULT_BENCHMARKS.items():
         benchmark = benchmark_builder()
         assert name == benchmark.name
         assert benchmark.env_args_list  # non-empty
@@ -71,7 +66,7 @@ def test_benchmark_subset():
-    benchmark: Benchmark = BENCHMARKS["miniwob"]()
+    benchmark: Benchmark = DEFAULT_BENCHMARKS["miniwob"]()
     benchmark_subset = benchmark.subset_from_regexp(column="task_name", regexp="click")
 
     assert len(benchmark_subset.env_args_list) == 31 * 5
@@ -105,11 +100,10 @@ def test_run_mock_benchmark():
             retry_with_force=False,
             demo_mode="off",
         ),
-        env_args_list=_make_env_args_list_from_repeat_tasks(
+        env_args_list=make_env_args_list_from_fixed_seeds(
             task_list=["miniwob.click-test"],
             max_steps=5,
-            n_repeats=2,
-            seeds_rng=np.random.RandomState(42),
+            fixed_seeds=[0, 1],
         ),
     )
 
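Usage sketch (illustrative only, based on the API this patch introduces): entries of DEFAULT_BENCHMARKS are zero-argument callables that build a Benchmark lazily, and a built benchmark can be narrowed with subset_from_regexp before its env_args_list is handed to an experiment loop.

    from browsergym.experiments.benchmark import DEFAULT_BENCHMARKS

    # build the benchmark lazily (each dict entry is a zero-argument callable)
    benchmark = DEFAULT_BENCHMARKS["miniwob"]()

    # keep only the click tasks, filtering on the attached task metadata
    click_subset = benchmark.subset_from_regexp(column="task_name", regexp="click")

    # each EnvArgs describes one (task, seed) environment to run
    for env_args in click_subset.env_args_list[:3]:
        print(env_args.task_name, env_args.task_seed, env_args.max_steps)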