benchmark code split refactor
gasse committed Oct 23, 2024
1 parent 33d57a0 commit fe219bb
Showing 14 changed files with 159 additions and 109 deletions.
6 changes: 5 additions & 1 deletion browsergym/experiments/src/bgym/__init__.py
@@ -2,7 +2,11 @@
from browsergym.core.action.highlevel import HighLevelActionSet
from browsergym.core.action.python import PythonActionSet
from browsergym.experiments.agent import Agent, AgentInfo
from browsergym.experiments.benchmark import Benchmark, HighLevelActionSetArgs, BENCHMARKS
from browsergym.experiments.benchmark import (
    DEFAULT_BENCHMARKS,
    Benchmark,
    HighLevelActionSetArgs,
)
from browsergym.experiments.loop import (
    AbstractAgentArgs,
    EnvArgs,
New file (filename not shown in this capture; its single line suggests the benchmark package's `__init__.py`):
@@ -0,0 +1 @@
from .base import DEFAULT_BENCHMARKS, Benchmark, HighLevelActionSetArgs
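The new package re-exports its public names from `.base`, keeping a flat import path for downstream code (as `bgym/__init__.py` above now does). A minimal sketch of the resulting import surface, using only names visible in this diff:

from browsergym.experiments.benchmark import (
    DEFAULT_BENCHMARKS,
    Benchmark,
    HighLevelActionSetArgs,
)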
Modified file (filename not shown in this capture; the test change below imports it as `browsergym.experiments.benchmark.base`):
@@ -1,7 +1,5 @@
import fnmatch
import io
import logging
import pkgutil
from dataclasses import dataclass, field
from typing import Literal, Optional

@@ -10,7 +8,13 @@
from dataclasses_json import DataClassJsonMixin, config

from browsergym.core.action.highlevel import HighLevelActionSet
from browsergym.experiments.loop import SEED_MAX, EnvArgs
from browsergym.experiments.loop import EnvArgs

from .metadata.utils import task_list_from_metadata, task_metadata
from .utils import (
    make_env_args_list_from_repeat_tasks,
    make_env_args_list_from_workarena_curriculum,
)

logger = logging.getLogger(__name__)

@@ -49,6 +53,7 @@ def make_action_set(self):
class Benchmark(DataClassJsonMixin):
    name: str
    high_level_action_set_args: HighLevelActionSetArgs
    is_multi_tab: bool
    env_args_list: list[EnvArgs]
    task_metadata: Optional[pd.DataFrame] = field(
        default_factory=lambda: None,
@@ -109,28 +114,6 @@ def subset_from_regexp(self, column, regexp):
        )


def task_metadata(benchmark_name: str):
    return task_metadata_from_csv(
        io.StringIO(
            pkgutil.get_data(__name__, f"task_metadata/{benchmark_name}.csv").decode("utf-8")
        )
    )


def task_metadata_from_csv(filepath):
    return pd.read_csv(filepath).fillna("")


def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {}):
    df = metadata
    # filter the desired columns (AND filter)
    for col_name, regex in filter.items():
        col_filter = df[col_name].astype(str).str.contains(regex, regex=True)
        df = df[col_filter]
    # return only the task names
    return list(df["task_name"])


# These are meant as the default high-level action sets to fairly evaluate agents on each benchmark.
# They are mostly arbitrary; what matters is that different agents are evaluated with the same action set, for fairness.
DEFAULT_HIGHLEVEL_ACTION_SET_ARGS = {
@@ -197,12 +180,13 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
    ),
}

# all benchmarks are callables designed for lazy loading, i.e. `bench = BENCHMARKS["miniwob_all"]()`
BENCHMARKS = {
# all benchmarks are callables designed for lazy loading, i.e. `bench = DEFAULT_BENCHMARKS["miniwob_all"]()`
DEFAULT_BENCHMARKS = {
    "miniwob": lambda: Benchmark(
        name="miniwob",
        high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob"],
        env_args_list=_make_env_args_list_from_repeat_tasks(
        is_multi_tab=False,
        env_args_list=make_env_args_list_from_repeat_tasks(
            task_list=task_list_from_metadata(metadata=task_metadata("miniwob")),
            max_steps=10,
            n_repeats=5,
@@ -213,7 +197,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
    "miniwob_tiny_test": lambda: Benchmark(
        name="miniwob_tiny_test",
        high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob"],
        env_args_list=_make_env_args_list_from_repeat_tasks(
        is_multi_tab=False,
        env_args_list=make_env_args_list_from_repeat_tasks(
            task_list=["miniwob.click-dialog", "miniwob.click-checkboxes"],
            max_steps=5,
            n_repeats=2,
@@ -224,7 +209,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
    "webarena": lambda: Benchmark(
        name="webarena",
        high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"],
        env_args_list=_make_env_args_list_from_repeat_tasks(
        is_multi_tab=True,
        env_args_list=make_env_args_list_from_repeat_tasks(
            task_list=task_list_from_metadata(metadata=task_metadata("webarena")),
            max_steps=15,
            n_repeats=1,
@@ -235,7 +221,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
    "visualwebarena": lambda: Benchmark(
        name="visualwebarena",
        high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["visualwebarena"],
        env_args_list=_make_env_args_list_from_repeat_tasks(
        is_multi_tab=True,
        env_args_list=make_env_args_list_from_repeat_tasks(
            task_list=task_list_from_metadata(metadata=task_metadata("visualwebarena")),
            max_steps=15,
            n_repeats=1,
@@ -246,7 +233,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
    "workarena_l1": lambda: Benchmark(
        name="workarena_l1",
        high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena_l1"],
        env_args_list=_make_env_args_list_from_workarena_curriculum(
        is_multi_tab=False,
        env_args_list=make_env_args_list_from_workarena_curriculum(
            level="l1",
            task_category_filter=None,
            meta_seed=42,  # meta seed for evaluation curriculum
@@ -259,7 +247,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
    "workarena_l2_agent_curriculum_eval": lambda: Benchmark(
        name="workarena_l2_agent_curriculum_eval",
        high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena"],
        env_args_list=_make_env_args_list_from_workarena_curriculum(
        is_multi_tab=True,
        env_args_list=make_env_args_list_from_workarena_curriculum(
            level="l2",
            task_category_filter=None,
            meta_seed=42,  # meta seed for evaluation curriculum
@@ -271,7 +260,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
    "workarena_l3_agent_curriculum_eval": lambda: Benchmark(
        name="workarena_l3_agent_curriculum_eval",
        high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena"],
        env_args_list=_make_env_args_list_from_workarena_curriculum(
        is_multi_tab=True,
        env_args_list=make_env_args_list_from_workarena_curriculum(
            level="l3",
            task_category_filter=None,
            meta_seed=42,  # meta seed for evaluation curriculum
@@ -283,7 +273,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
    "assistantbench": lambda: Benchmark(
        name="assistantbench",
        high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["assistantbench"],
        env_args_list=_make_env_args_list_from_repeat_tasks(
        is_multi_tab=True,
        env_args_list=make_env_args_list_from_repeat_tasks(
            task_list=task_list_from_metadata(
                metadata=task_metadata("assistantbench"), filter={"browsergym_split": "valid|test"}
            ),
@@ -294,62 +285,3 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
        task_metadata=task_metadata("assistantbench"),
    ),
}


def _make_env_args_list_from_workarena_curriculum(
    level: Literal["l1", "l2", "l3"],
    task_category_filter: str,
    meta_seed: int,
    max_steps: int,
    curriculum_type: Literal["human", "agent"],
    seeds_l1: int = 10,
):
    """
    Returns a WorkArena predefined task curriculum (i.e., a list of task and seed combinations).
    """
    assert level in ("l1", "l2", "l3")
    assert curriculum_type in ("human", "agent")

    env_args_list = []

    from browsergym.workarena import get_all_tasks_agents

    all_task_tuples = get_all_tasks_agents(
        filter=f"{level}.{task_category_filter}" if task_category_filter else level,
        meta_seed=meta_seed,
        is_agent_curriculum=(curriculum_type == "agent"),
        n_seed_l1=seeds_l1,
    )

    for task, seed in all_task_tuples:
        task_name = task.get_task_id()
        env_args_list.append(EnvArgs(task_name=task_name, task_seed=seed, max_steps=max_steps))

    return env_args_list


def _make_env_args_list_from_repeat_tasks(
    task_list: list[str], max_steps: int, n_repeats: int, seeds_rng: np.random.RandomState
):
    """
    Generates a list of `len(task_list)` times `n_repeats` environment arguments, using randomly generated seeds.
    """
    env_args_list = []
    for task in task_list:
        for seed in seeds_rng.randint(low=0, high=SEED_MAX, size=n_repeats):
            env_args_list.append(
                EnvArgs(
                    task_name=task,
                    task_seed=int(seed),
                    max_steps=max_steps,
                    headless=True,
                    record_video=False,
                    wait_for_user_message=False,
                    viewport=None,
                    slow_mo=None,
                    storage_state=None,
                    task_kwargs=None,
                )
            )

    return env_args_list
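As the `# all benchmarks are callables designed for lazy loading` comment above notes, each registry entry is a zero-argument callable, so metadata CSV loading and seed generation are deferred until the entry is invoked. A hedged usage sketch (assumes the relevant task packages are installed; `make_action_set` is taken from the hunk header above):

from browsergym.experiments.benchmark import DEFAULT_BENCHMARKS

# nothing is built at import time; invoking the callable constructs the Benchmark
benchmark = DEFAULT_BENCHMARKS["miniwob"]()

# each benchmark carries a default action-set configuration, so different
# agents can be evaluated with the same action set
action_set = benchmark.high_level_action_set_args.make_action_set()

# one EnvArgs per (task, seed) combination
print(benchmark.name, len(benchmark.env_args_list))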
New file (filename not shown in this capture; imported above as `.metadata.utils`):
@@ -0,0 +1,24 @@
import io
import pkgutil

import pandas as pd


def task_metadata(benchmark_name: str):
    return task_metadata_from_csv(
        io.StringIO(pkgutil.get_data(__name__, f"{benchmark_name}.csv").decode("utf-8"))
    )


def task_metadata_from_csv(filepath):
    return pd.read_csv(filepath).fillna("")


def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {}):
    df = metadata
    # filter the desired columns (AND filter)
    for col_name, regex in filter.items():
        col_filter = df[col_name].astype(str).str.contains(regex, regex=True)
        df = df[col_filter]
    # return only the task names
    return list(df["task_name"])
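`task_list_from_metadata` applies the `filter` dict as an AND across columns, matching each value as a regex against the column's rows, then returns the `task_name` column. A small self-contained sketch (module path inferred from the `.metadata.utils` import above; the metadata rows are made up, only the column names come from this diff):

import pandas as pd

from browsergym.experiments.benchmark.metadata.utils import task_list_from_metadata

# hypothetical metadata mirroring the CSV layout these helpers expect
metadata = pd.DataFrame(
    {
        "task_name": ["assistantbench.validation.0", "assistantbench.test.5"],
        "browsergym_split": ["valid", "test"],
    }
)

# each filter value is a regex matched per row ("valid|test" matches both rows)
tasks = task_list_from_metadata(metadata, filter={"browsergym_split": "valid|test"})
assert tasks == ["assistantbench.validation.0", "assistantbench.test.5"]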
New file (filename not shown in this capture; imported above as `.utils`):
@@ -0,0 +1,95 @@
from typing import Literal

import numpy as np

from browsergym.experiments.loop import SEED_MAX, EnvArgs


def make_env_args_list_from_workarena_curriculum(
    level: Literal["l1", "l2", "l3"],
    task_category_filter: str,
    meta_seed: int,
    max_steps: int,
    curriculum_type: Literal["human", "agent"],
    seeds_l1: int = 10,
):
    """
    Returns a WorkArena predefined task curriculum (i.e., a list of task and seed combinations).
    """
    assert level in ("l1", "l2", "l3")
    assert curriculum_type in ("human", "agent")

    env_args_list = []

    # dynamic import, so the workarena package is only required when this curriculum is used
    from browsergym.workarena import get_all_tasks_agents

    all_task_tuples = get_all_tasks_agents(
        filter=f"{level}.{task_category_filter}" if task_category_filter else level,
        meta_seed=meta_seed,
        is_agent_curriculum=(curriculum_type == "agent"),
        n_seed_l1=seeds_l1,
    )

    for task, seed in all_task_tuples:
        task_name = task.get_task_id()
        env_args_list.append(EnvArgs(task_name=task_name, task_seed=seed, max_steps=max_steps))

    return env_args_list


def make_env_args_list_from_repeat_tasks(
    task_list: list[str], max_steps: int, n_repeats: int, seeds_rng: np.random.RandomState
):
    """
    Generates a list of `len(task_list)` times `n_repeats` environment arguments, using randomly generated seeds.
    """
    env_args_list = []
    for task in task_list:
        for seed in seeds_rng.randint(low=0, high=SEED_MAX, size=n_repeats):
            env_args_list.append(
                EnvArgs(
                    task_name=task,
                    task_seed=int(seed),
                    max_steps=max_steps,
                    headless=True,
                    record_video=False,
                    wait_for_user_message=False,
                    viewport=None,
                    slow_mo=None,
                    storage_state=None,
                    task_kwargs=None,
                )
            )

    return env_args_list


def make_env_args_list_from_fixed_seeds(
    task_list: list[str], max_steps: int, fixed_seeds: list[int]
):
    """
    Generates a list of `len(task_list)` times `len(fixed_seeds)` environment arguments, using the given fixed seeds.
    """
    env_args_list = []
    for task in task_list:
        for seed in fixed_seeds:
            env_args_list.append(
                EnvArgs(
                    task_name=task,
                    task_seed=int(seed),
                    max_steps=max_steps,
                    headless=True,
                    record_video=False,
                    wait_for_user_message=False,
                    viewport=None,
                    slow_mo=None,
                    storage_state=None,
                    task_kwargs=None,
                )
            )

    return env_args_list
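Both `make_env_args_list_*` helpers expand a task list into one `EnvArgs` per (task, seed) pair; the repeat-tasks variant draws its seeds from a caller-supplied `RandomState`, which makes the generated benchmark reproducible. A usage sketch (module path inferred from the `.utils` import above; task names borrowed from the `miniwob_tiny_test` entry):

import numpy as np

from browsergym.experiments.benchmark.utils import make_env_args_list_from_repeat_tasks

# seeds come from the caller-supplied RNG, so reusing RandomState(42)
# regenerates the exact same (task, seed) combinations
env_args_list = make_env_args_list_from_repeat_tasks(
    task_list=["miniwob.click-dialog", "miniwob.click-checkboxes"],
    max_steps=5,
    n_repeats=2,
    seeds_rng=np.random.RandomState(42),
)
assert len(env_args_list) == 4  # len(task_list) * n_repeats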
2 changes: 1 addition & 1 deletion tests/assistantbench/test_evaluation.py
@@ -5,7 +5,7 @@
import pytest

from browsergym.assistantbench.evaluation.evaluator import question_scorer
from browsergym.experiments.benchmark import task_list_from_metadata, task_metadata
from browsergym.experiments.benchmark.base import task_list_from_metadata, task_metadata

__DATA_DIR = pathlib.Path(__file__).resolve().parent / "data"
