Skip to content

Commit

Permalink
benchmark prepare_backend refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
gasse committed Oct 24, 2024
1 parent 67b9686 commit 5c6654b
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 26 deletions.
81 changes: 65 additions & 16 deletions browsergym/experiments/src/browsergym/experiments/benchmark/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import fnmatch
import logging
import typing
from dataclasses import dataclass, field
from typing import Literal, Optional

Expand Down Expand Up @@ -49,13 +50,16 @@ def make_action_set(self):
)


# Closed set of backend names a benchmark may list in `Benchmark.backends`.
# `Benchmark.__post_init__` validates each entry against these values via `typing.get_args`.
BenchmarkBackend = Literal["miniwob", "webarena", "visualwebarena", "workarena", "assistantbench"]


@dataclass
class Benchmark(DataClassJsonMixin):
name: str
high_level_action_set_args: HighLevelActionSetArgs
is_multi_tab: bool
env_args_list: list[EnvArgs]
full_reset_script: Optional[str]
backends: list[BenchmarkBackend]
task_metadata: Optional[pd.DataFrame] = field(
default_factory=lambda: None,
metadata=config(
Expand All @@ -74,6 +78,57 @@ def __post_init__(self):
# make sure all tasks in env_args are in the metadata
metadata_tasks = list(self.task_metadata["task_name"])
assert all([env_args.task_name in metadata_tasks for env_args in self.env_args_list])
# check backend values
assert all([backend in typing.get_args(BenchmarkBackend) for backend in self.backends])

def prepare_backends(self):
    """
    Prepare every backend listed in `self.backends`.

    Backend packages are imported lazily, inside this method, so that only the
    backends a benchmark actually declares need to be importable. Importing a
    backend package registers its environments. After the import, a
    backend-specific step is run:

    - "miniwob": `browsergym.miniwob.environment_variables_precheck()`
    - "webarena": `WebArenaInstance().full_reset()`
    - "visualwebarena": `VisualWebArenaInstance().full_reset()`
    - "workarena": `SNowInstance().check_status()`
    - "assistantbench": import only

    Raises:
        ValueError: if a backend name is not one of the names handled above.
    """
    for backend in self.backends:
        if backend == "miniwob":
            # importing the package registers the environments
            import browsergym.miniwob

            # verify the setup before anything is run
            browsergym.miniwob.environment_variables_precheck()

        elif backend == "webarena":
            # importing the package registers the environments
            import browsergym.webarena
            from browsergym.webarena.instance import WebArenaInstance

            # full reset of the instance (requires environment variables properly set up)
            WebArenaInstance().full_reset()

        elif backend == "visualwebarena":
            # importing the package registers the environments
            import browsergym.visualwebarena
            from browsergym.visualwebarena.instance import VisualWebArenaInstance

            # full reset of the instance (requires environment variables properly set up)
            VisualWebArenaInstance().full_reset()

        elif backend == "workarena":
            # importing the package registers the environments
            import browsergym.workarena
            from browsergym.workarena.instance import SNowInstance

            # make sure the server is up
            SNowInstance().check_status()

        elif backend == "assistantbench":
            # importing the package registers the environments, nothing else to do
            import browsergym.assistantbench

        else:
            raise ValueError(f"Unknown benchmark backend {repr(backend)}")

def subset_from_split(self, split: Literal["train", "valid", "test"]):
split_column = "browsergym_split"
Expand Down Expand Up @@ -107,7 +162,7 @@ def subset_from_regexp(self, column, regexp):
name=f"{self.name}[{column}=/{regexp}/]",
high_level_action_set_args=self.high_level_action_set_args,
is_multi_tab=self.is_multi_tab,
full_reset_script=self.full_reset_script,
backends=self.backends,
env_args_list=[
env_args
for env_args in self.env_args_list
Expand Down Expand Up @@ -193,7 +248,7 @@ def subset_from_regexp(self, column, regexp):
name="miniwob",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob_all"],
is_multi_tab=False,
full_reset_script=None,
backends=["miniwob"],
env_args_list=make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(metadata=task_metadata("miniwob")),
max_steps=10,
Expand All @@ -206,7 +261,7 @@ def subset_from_regexp(self, column, regexp):
name="miniwob_tiny_test",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob_all"],
is_multi_tab=False,
full_reset_script=None,
backends=["miniwob"],
env_args_list=make_env_args_list_from_repeat_tasks(
task_list=["miniwob.click-dialog", "miniwob.click-checkboxes"],
max_steps=5,
Expand All @@ -219,10 +274,7 @@ def subset_from_regexp(self, column, regexp):
name="webarena",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"],
is_multi_tab=True,
full_reset_script="""\
import browsergym.webarena.instance
browsergym.webarena.instance.WebArenaInstance().full_reset()
""",
backends=["webarena"],
env_args_list=make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(metadata=task_metadata("webarena")),
max_steps=15,
Expand All @@ -235,10 +287,7 @@ def subset_from_regexp(self, column, regexp):
name="visualwebarena",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["visualwebarena"],
is_multi_tab=True,
full_reset_script="""\
import browsergym.visualwebarena.instance
browsergym.visualwebarena.instance.VisualWebArenaInstance().full_reset()
""",
backends=["visualwebarena"],
env_args_list=make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(metadata=task_metadata("visualwebarena")),
max_steps=15,
Expand All @@ -251,7 +300,7 @@ def subset_from_regexp(self, column, regexp):
name="workarena_l1",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena"],
is_multi_tab=False,
full_reset_script=None,
backends=["workarena"],
env_args_list=make_env_args_list_from_workarena_curriculum(
level="l1",
task_category_filter=None,
Expand All @@ -266,7 +315,7 @@ def subset_from_regexp(self, column, regexp):
name="workarena_l2_agent_curriculum_eval",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena++"],
is_multi_tab=True,
full_reset_script=None,
backends=["workarena"],
env_args_list=make_env_args_list_from_workarena_curriculum(
level="l2",
task_category_filter=None,
Expand All @@ -280,7 +329,7 @@ def subset_from_regexp(self, column, regexp):
name="workarena_l3_agent_curriculum_eval",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena++"],
is_multi_tab=True,
full_reset_script=None,
backends=["workarena"],
env_args_list=make_env_args_list_from_workarena_curriculum(
level="l3",
task_category_filter=None,
Expand All @@ -294,7 +343,7 @@ def subset_from_regexp(self, column, regexp):
name="assistantbench",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["assistantbench"],
is_multi_tab=True,
full_reset_script=None,
backends=["assistantbench"],
env_args_list=make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(
metadata=task_metadata("assistantbench"), filter={"browsergym_split": "valid|test"}
Expand Down
9 changes: 9 additions & 0 deletions browsergym/miniwob/src/browsergym/miniwob/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
import os

from browsergym.core.registration import register_task

from . import all


def environment_variables_precheck():
    """
    Check that the MINIWOB_URL environment variable is set to a non-empty value.

    Raises:
        AssertionError: if MINIWOB_URL is missing or is an empty string.
    """
    # Raise explicitly instead of using an `assert` statement: assert statements are
    # removed when Python runs with -O, which would silently disable this check.
    # The exception type and message are the same as with the assert statement.
    if not os.environ.get("MINIWOB_URL", None):
        raise AssertionError("Environment variable MINIWOB_URL has not been setup.")


ALL_MINIWOB_TASKS = [
all.AscendingNumbersTask,
all.BisectAngleTask,
Expand Down
1 change: 1 addition & 0 deletions browsergym/miniwob/src/browsergym/miniwob/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Optional, Tuple

import playwright.sync_api

from browsergym.core.task import AbstractBrowserTask


Expand Down
20 changes: 17 additions & 3 deletions browsergym/webarena/src/browsergym/webarena/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,21 +76,35 @@ def full_reset(self):
f"Full instance reset failed ({response.status_code}): {response.status_code}"
)

# warm-start the instance (navigate to every domain)
retries_left = 3
while retries_left:
retries_left -= 1
try:
self._check_is_reachable(timeout=60) # 60 seconds, cold starting might be slow
break
except Exception as e:
if not retries_left:
raise
logger.info(
f"Instance unresponsive after reset, retrying ({retries_left} retries left)\n{e}"
)

def check_status(self):
"""
Check the status of the instance. Raises an error if the instance is not ready to be used.
"""
# NOTE(review): two calls are visible here, one without a timeout and one with timeout=10.
# This looks like the removed and the added line of the same diff hunk shown together;
# presumably only the timeout=10 call belongs to the final method - confirm against the file.
self._check_is_reachable()
self._check_is_reachable(timeout=10) # 10 seconds

def _check_is_reachable(self):
def _check_is_reachable(self, timeout: int):
"""
Test that every website is reachable.
"""
for site, url in self.urls.items():
try:
requests.get(url, timeout=5000) # 5 secs
requests.get(url, timeout=timeout)
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
raise RuntimeError(
f'WebArena site "{site}" ({url}) is not reacheable. Please check the URL.'
Expand Down
32 changes: 25 additions & 7 deletions tests/experiments/test_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,21 +92,39 @@ def test_benchmark_subset():
assert dict_1 == dict_2


def test_miniwob_benchmark_reset():
    """Backend preparation passes when MINIWOB_URL is set, and fails when it is unset or empty."""
    # remember the configured URL so the environment can be restored afterwards
    saved_url = os.environ["MINIWOB_URL"]
    try:
        benchmark: Benchmark = DEFAULT_BENCHMARKS["miniwob"]()

        # properly configured environment: no error expected
        benchmark.prepare_backends()

        # variable missing altogether
        os.environ.pop("MINIWOB_URL")
        with pytest.raises(Exception):
            benchmark.prepare_backends()

        # variable present but empty
        os.environ["MINIWOB_URL"] = ""
        with pytest.raises(Exception):
            benchmark.prepare_backends()
    finally:
        # restore the environment whatever the outcome
        os.environ["MINIWOB_URL"] = saved_url


# Checks the webarena benchmark reset: once with the configured WA_FULL_RESET, then with the
# variable removed, then with it pointing to http://localhost:12345/reset (both expected to raise).
# NOTE(review): each step shows both `exec(benchmark.full_reset_script)` and
# `benchmark.prepare_backends()`. This looks like the removed and the added line of the same
# diff hunk shown together; presumably only the prepare_backends() calls remain - confirm.
@pytest.mark.skip
def test_webarena_benchmark_reset():
# keep the original value so it can be restored in the finally clause
WA_FULL_RESET = os.environ["WA_FULL_RESET"]
try:
benchmark: Benchmark = DEFAULT_BENCHMARKS["webarena"]()

# reset with the configured environment
exec(benchmark.full_reset_script)
benchmark.prepare_backends()

# reset URL variable removed from the environment: an exception is expected
del os.environ["WA_FULL_RESET"]
with pytest.raises(Exception):
exec(benchmark.full_reset_script)
benchmark.prepare_backends()

# reset URL variable set to a localhost address: an exception is expected
os.environ["WA_FULL_RESET"] = "http://localhost:12345/reset"
with pytest.raises(Exception):
exec(benchmark.full_reset_script)
benchmark.prepare_backends()
finally:
# restore the original value
os.environ["WA_FULL_RESET"] = WA_FULL_RESET

Expand All @@ -117,15 +135,15 @@ def test_visualwebarena_benchmark_reset():
try:
benchmark: Benchmark = DEFAULT_BENCHMARKS["visualwebarena"]()

exec(benchmark.full_reset_script)
benchmark.prepare_backends()

del os.environ["VWA_FULL_RESET"]
with pytest.raises(Exception):
exec(benchmark.full_reset_script)
benchmark.prepare_backends()

os.environ["VWA_FULL_RESET"] = "http://localhost:12345/reset"
with pytest.raises(Exception):
exec(benchmark.full_reset_script)
benchmark.prepare_backends()
finally:
os.environ["VWA_FULL_RESET"] = VWA_FULL_RESET

Expand All @@ -141,7 +159,7 @@ def test_run_mock_benchmark():
demo_mode="off",
),
is_multi_tab=False,
full_reset_script=None,
backends=None,
env_args_list=make_env_args_list_from_fixed_seeds(
task_list=["miniwob.click-test"],
max_steps=5,
Expand Down

0 comments on commit 5c6654b

Please sign in to comment.