From d078889948f5e712664167fc47885ec9df03fd46 Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Wed, 23 Oct 2024 11:45:12 -0400 Subject: [PATCH 1/4] default retry_with_force=True --- .../src/browsergym/experiments/benchmark/base.py | 16 ++++++++-------- tests/experiments/test_benchmark.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/base.py b/browsergym/experiments/src/browsergym/experiments/benchmark/base.py index 77c78916..64a4aa4d 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/base.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/base.py @@ -123,7 +123,7 @@ def subset_from_regexp(self, column, regexp): subsets=["bid", "coord"], multiaction=False, strict=False, - retry_with_force=False, + retry_with_force=True, demo_mode="off", ), # loosely from https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L160 @@ -131,7 +131,7 @@ def subset_from_regexp(self, column, regexp): subsets=["bid"], multiaction=False, strict=False, - retry_with_force=False, + retry_with_force=True, demo_mode="off", ), # loosely from https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L173 @@ -139,21 +139,21 @@ def subset_from_regexp(self, column, regexp): subsets=["coord"], multiaction=False, strict=False, - retry_with_force=False, + retry_with_force=True, demo_mode="off", ), "workarena_l1": HighLevelActionSetArgs( subsets=["chat", "bid"], # no need for infeasible action multiaction=False, strict=False, - retry_with_force=False, + retry_with_force=True, demo_mode="off", ), "workarena": HighLevelActionSetArgs( subsets=["chat", "infeas", "bid", "tab", "nav"], multiaction=False, strict=False, - retry_with_force=False, + retry_with_force=True, demo_mode="off", ), # from https://arxiv.org/abs/2307.13854 @@ -161,7 +161,7 @@ def subset_from_regexp(self, column, regexp): subsets=["chat", "infeas", "bid", "tab", "nav"], multiaction=False, strict=False, - retry_with_force=False, + retry_with_force=True, demo_mode="off", ), # from https://arxiv.org/abs/2401.13649 @@ -169,14 +169,14 @@ def subset_from_regexp(self, column, regexp): subsets=["chat", "infeas", "bid", "tab", "nav"], multiaction=False, strict=False, - retry_with_force=False, + retry_with_force=True, demo_mode="off", ), "assistantbench": HighLevelActionSetArgs( subsets=["chat", "bid", "tab", "nav"], multiaction=False, strict=False, - retry_with_force=False, + retry_with_force=True, demo_mode="off", ), } diff --git a/tests/experiments/test_benchmark.py b/tests/experiments/test_benchmark.py index 3911f678..a696c6e6 100644 --- a/tests/experiments/test_benchmark.py +++ b/tests/experiments/test_benchmark.py @@ -97,7 +97,7 @@ def test_run_mock_benchmark(): subsets=["bid"], multiaction=False, strict=False, - retry_with_force=False, + retry_with_force=True, demo_mode="off", ), is_multi_tab=False, From a164f121d1087d52587ddb9d37414de2700a142a Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Wed, 23 Oct 2024 12:51:13 -0400 Subject: [PATCH 2/4] paper-specific action subsets --- .../src/browsergym/core/action/highlevel.py | 238 ++++++++++++------ .../browsergym/experiments/benchmark/base.py | 42 ++-- 2 files changed, 191 insertions(+), 89 deletions(-) diff --git a/browsergym/core/src/browsergym/core/action/highlevel.py b/browsergym/core/src/browsergym/core/action/highlevel.py index 241c131f..8a752da2 100644 --- a/browsergym/core/src/browsergym/core/action/highlevel.py +++ b/browsergym/core/src/browsergym/core/action/highlevel.py @@ -1,7 +1,7 @@ import inspect import random +import typing from dataclasses import dataclass -from typing import Literal, Optional from . import utils from .base import AbstractActionSet @@ -41,50 +41,153 @@ ) from .parsers import action_docstring_parser, highlevel_action_parser -CHAT_ACTIONS = [send_msg_to_user] - -INFEAS_ACTIONS = [report_infeasible] - -BID_ACTIONS = [ - scroll, - fill, - # These are not really needed and might pollute the action space, doing more harm than good - # check, - # uncheck, - select_option, - click, - dblclick, - hover, - press, - focus, - clear, - drag_and_drop, - upload_file, -] - -COORD_ACTIONS = [ - scroll, - mouse_move, - mouse_up, - mouse_down, - mouse_click, - mouse_dblclick, - mouse_drag_and_drop, - mouse_upload_file, - keyboard_down, - keyboard_up, - keyboard_press, - keyboard_type, - keyboard_insert_text, -] - -NAV_ACTIONS = [go_back, go_forward, goto] - -TAB_ACTIONS = [ - tab_close, - tab_focus, - new_tab, -] +ACTION_SUBSETS = { + "chat": [send_msg_to_user], + "infeas": [report_infeasible], + "bid": [ + scroll, + fill, + # These are not really needed and might pollute the action space, doing more harm than good + # check, + # uncheck, + select_option, + click, + dblclick, + hover, + press, + focus, + clear, + drag_and_drop, + upload_file, + ], + "coord": [ + scroll, + mouse_move, + mouse_up, + mouse_down, + mouse_click, + mouse_dblclick, + mouse_drag_and_drop, + mouse_upload_file, + keyboard_down, + keyboard_up, + keyboard_press, + keyboard_type, + keyboard_insert_text, + ], + "nav": [go_back, go_forward, goto], + "tab": [ + tab_close, + tab_focus, + new_tab, + ], + # loosely taken from https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L122 + "miniwob_all": [ + mouse_move, # MOVE_COORDS + mouse_click, # CLICK_COORDS + mouse_dblclick, # DBLCLICK_COORDS + mouse_down, # MOUSEDOWN_COORDS + mouse_up, # MOUSEUP_COORDS + scroll, # SCROLL_UP_COORDS, SCROLL_DOWN_COORDS + click, # CLICK_ELEMENT + keyboard_press, # PRESS_KEY + keyboard_type, # TYPE_TEX (and substitute for TYPE_FIELD() + fill, # FOCUS_ELEMENT_AND_TYPE_TEXT (and substitute for FOCUS_ELEMENT_AND_TYPE_FIELD) + ], + # loosely taken from https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L142 + "miniwob_shi17": [ + mouse_click, # CLICK_COORDS + mouse_dblclick, # DBLCLICK_COORDS + mouse_down, # MOUSEDOWN_COORDS + mouse_up, # MOUSEUP_COORDS + scroll, # SCROLL_UP_COORDS, SCROLL_DOWN_COORDS + keyboard_press, # PRESS_KEY + ], + # loosely taken from https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L160 + "miniwob_liu18": [ + click, # CLICK_ELEMENT + fill, # substitute for FOCUS_ELEMENT_AND_TYPE_FIELD + ], + # loosely taken from https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L173 + "miniwob_humphreys22": [ + mouse_move, # MOVE_COORDS + mouse_click, # CLICK_COORDS + mouse_dblclick, # DBLCLICK_COORDS + mouse_down, # MOUSEDOWN_COORDS + mouse_up, # MOUSEUP_COORDS + scroll, # SCROLL_UP_COORDS, SCROLL_DOWN_COORDS + keyboard_press, # PRESS_KEY + keyboard_type, # substitute for TYPE_FIELD + ], + # from webarena paper https://arxiv.org/abs/2307.13854 + "webarena": [ + click, # click(elem) + hover, # hover(elem) + fill, # type(elem, text) + keyboard_press, # press(key_comb) + scroll, # scroll(dir) + tab_focus, # tab_focus(index) + new_tab, # new_tab() + tab_close, # tab_close() + go_back, # go_back() + go_forward, # go_forward() + goto, # goto(url) + send_msg_to_user, # + report_infeasible, # explicit unachievable action, equivalent to "N/A" answer + ], + # from visualwebarena paper https://arxiv.org/abs/2401.13649 + "visualwebarena": [ + click, # click(elem) + hover, # hover(elem) + fill, # type(elem, text) + keyboard_press, # press(key_comb) + scroll, # scroll(dir) + tab_focus, # tab_focus(index) + new_tab, # new_tab() + tab_close, # tab_close() + go_back, # go_back() + go_forward, # go_forward() + goto, # goto(url) + send_msg_to_user, # stop(answer) + report_infeasible, # explicit unachievable action, equivalent to "N/A" answer + upload_file, # + ], + # from workarena paper https://arxiv.org/abs/2403.07718 + "workarena": [ + scroll, + fill, + select_option, + click, + dblclick, + hover, + press, + focus, + clear, + drag_and_drop, + send_msg_to_user, + ], + # from workarena++ paper https://arxiv.org/abs/2407.05291 + "workarena++": [ + scroll, + fill, + select_option, + click, + dblclick, + hover, + press, + focus, + clear, + drag_and_drop, + tab_focus, + new_tab, + tab_close, + go_back, + go_forward, + goto, + send_msg_to_user, + report_infeasible, + ], +} @dataclass @@ -98,20 +201,21 @@ class HighLevelAction: class HighLevelActionSet(AbstractActionSet): # static class variables - ActionSubset = Literal["chat", "infeas", "bid", "coord", "nav", "tab", "custom"] + ActionSubset = typing.Literal["chat", "infeas", "bid", "coord", "nav", "tab", "custom"] + DemoMode = typing.Literal["off", "default", "all_blue", "only_visible_elements"] def __init__( self, - subsets: Optional[ActionSubset | list[ActionSubset]] = [ + subsets: typing.Optional[ActionSubset | list[ActionSubset]] = [ "chat", "infeas", "bid", "nav", "tab", ], - custom_actions: Optional[list[callable]] = None, + custom_actions: typing.Optional[list[callable]] = None, multiaction: bool = True, - demo_mode: Optional[Literal["off", "default", "all_blue", "only_visible_elements"]] = None, + demo_mode: typing.Optional[DemoMode] = None, strict: bool = False, retry_with_force: bool = False, ): @@ -131,27 +235,16 @@ def __init__( # add actions from specified action sets if subsets: for subset in subsets: - match subset: - case "chat": - allowed_actions.extend(CHAT_ACTIONS) - case "infeas": - allowed_actions.extend(INFEAS_ACTIONS) - case "bid": - allowed_actions.extend(BID_ACTIONS) - case "coord": - allowed_actions.extend(COORD_ACTIONS) - case "nav": - allowed_actions.extend(NAV_ACTIONS) - case "tab": - allowed_actions.extend(TAB_ACTIONS) - case "custom": - if not custom_actions: - raise ValueError( - "'custom' is in 'action_subsets' but 'custom_actions' is empty." - ) - allowed_actions.extend(custom_actions) - case _: - raise ValueError(f"Unknown high-level action subspace: {subset}") + if subset in ACTION_SUBSETS: + allowed_actions.extend(ACTION_SUBSETS[subset]) + elif subset == "custom": + if not custom_actions: + raise ValueError( + "'custom' is in 'action_subsets' but 'custom_actions' is empty." + ) + allowed_actions.extend(custom_actions) + else: + raise ValueError(f"Unknown high-level action subspace: {subset}") # like set() but preserves order # https://stackoverflow.com/questions/1653970/does-python-have-an-ordered-set @@ -344,3 +437,8 @@ def to_python_code(self, action): # return the constructed python code return python_code + + +# consistency checks +assert "custom" not in ACTION_SUBSETS +assert set(typing.get_args(HighLevelActionSet.ActionSubset)) == set(ACTION_SUBSETS.keys()) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/base.py b/browsergym/experiments/src/browsergym/experiments/benchmark/base.py index 64a4aa4d..b3d49200 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/base.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/base.py @@ -118,39 +118,43 @@ def subset_from_regexp(self, column, regexp): # These are mean as the default highlevel action set to fairly evaluate agents on each benchmark. # They are mostly arbitrary, the important thing is to evaluate different agents using the same action set for fairness. DEFAULT_HIGHLEVEL_ACTION_SET_ARGS = { - # loosely from https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L122 - "miniwob": HighLevelActionSetArgs( - subsets=["bid", "coord"], + "miniwob_all": HighLevelActionSetArgs( + subsets=["miniwob_all"], multiaction=False, strict=False, retry_with_force=True, demo_mode="off", ), - # loosely from https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L160 - "miniwob_bid": HighLevelActionSetArgs( - subsets=["bid"], + "miniwob_liu18": HighLevelActionSetArgs( + subsets=["miniwob_liu18"], multiaction=False, strict=False, retry_with_force=True, demo_mode="off", ), - # loosely from https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L173 - "miniwob_coord": HighLevelActionSetArgs( - subsets=["coord"], + "miniwob_shi17": HighLevelActionSetArgs( + subsets=["miniwob_shi17"], multiaction=False, strict=False, retry_with_force=True, demo_mode="off", ), - "workarena_l1": HighLevelActionSetArgs( - subsets=["chat", "bid"], # no need for infeasible action + "miniwob_humphreys22": HighLevelActionSetArgs( + subsets=["miniwob_humphreys22"], multiaction=False, strict=False, retry_with_force=True, demo_mode="off", ), "workarena": HighLevelActionSetArgs( - subsets=["chat", "infeas", "bid", "tab", "nav"], + subsets=["workarena"], # no need for infeasible action + multiaction=False, + strict=False, + retry_with_force=True, + demo_mode="off", + ), + "workarena++": HighLevelActionSetArgs( + subsets=["workarena++"], multiaction=False, strict=False, retry_with_force=True, @@ -158,7 +162,7 @@ def subset_from_regexp(self, column, regexp): ), # from https://arxiv.org/abs/2307.13854 "webarena": HighLevelActionSetArgs( - subsets=["chat", "infeas", "bid", "tab", "nav"], + subsets=["webarena"], multiaction=False, strict=False, retry_with_force=True, @@ -166,7 +170,7 @@ def subset_from_regexp(self, column, regexp): ), # from https://arxiv.org/abs/2401.13649 "visualwebarena": HighLevelActionSetArgs( - subsets=["chat", "infeas", "bid", "tab", "nav"], + subsets=["visualwebarena"], multiaction=False, strict=False, retry_with_force=True, @@ -185,7 +189,7 @@ def subset_from_regexp(self, column, regexp): DEFAULT_BENCHMARKS = { "miniwob": lambda: Benchmark( name="miniwob", - high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob"], + high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob_all"], is_multi_tab=False, env_args_list=make_env_args_list_from_repeat_tasks( task_list=task_list_from_metadata(metadata=task_metadata("miniwob")), @@ -197,7 +201,7 @@ def subset_from_regexp(self, column, regexp): ), "miniwob_tiny_test": lambda: Benchmark( name="miniwob_tiny_test", - high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob"], + high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob_all"], is_multi_tab=False, env_args_list=make_env_args_list_from_repeat_tasks( task_list=["miniwob.click-dialog", "miniwob.click-checkboxes"], @@ -233,7 +237,7 @@ def subset_from_regexp(self, column, regexp): ), "workarena_l1": lambda: Benchmark( name="workarena_l1", - high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena_l1"], + high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena"], is_multi_tab=False, env_args_list=make_env_args_list_from_workarena_curriculum( level="l1", @@ -247,7 +251,7 @@ def subset_from_regexp(self, column, regexp): ), "workarena_l2_agent_curriculum_eval": lambda: Benchmark( name="workarena_l2_agent_curriculum_eval", - high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena"], + high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena++"], is_multi_tab=True, env_args_list=make_env_args_list_from_workarena_curriculum( level="l2", @@ -260,7 +264,7 @@ def subset_from_regexp(self, column, regexp): ), "workarena_l3_agent_curriculum_eval": lambda: Benchmark( name="workarena_l3_agent_curriculum_eval", - high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena"], + high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena++"], is_multi_tab=True, env_args_list=make_env_args_list_from_workarena_curriculum( level="l3", From 77b16ccbf9d4d80e989edf21e4b07db796172740 Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Wed, 23 Oct 2024 12:56:34 -0400 Subject: [PATCH 3/4] fixes --- .../src/browsergym/core/action/highlevel.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/browsergym/core/src/browsergym/core/action/highlevel.py b/browsergym/core/src/browsergym/core/action/highlevel.py index 8a752da2..2fca7fb6 100644 --- a/browsergym/core/src/browsergym/core/action/highlevel.py +++ b/browsergym/core/src/browsergym/core/action/highlevel.py @@ -201,7 +201,23 @@ class HighLevelAction: class HighLevelActionSet(AbstractActionSet): # static class variables - ActionSubset = typing.Literal["chat", "infeas", "bid", "coord", "nav", "tab", "custom"] + ActionSubset = typing.Literal[ + "chat", + "infeas", + "bid", + "coord", + "nav", + "tab", + "miniwob_all", + "miniwob_shi17", + "miniwob_liu18", + "miniwob_humphreys22", + "webarena", + "visualwebarena", + "workarena", + "workarena++", + "custom", + ] DemoMode = typing.Literal["off", "default", "all_blue", "only_visible_elements"] def __init__( @@ -441,4 +457,6 @@ def to_python_code(self, action): # consistency checks assert "custom" not in ACTION_SUBSETS -assert set(typing.get_args(HighLevelActionSet.ActionSubset)) == set(ACTION_SUBSETS.keys()) +assert set(typing.get_args(HighLevelActionSet.ActionSubset)) == set( + list(ACTION_SUBSETS.keys()) + ["custom"] +) From bd3678204cb7ab182852742b85876bc1bbc17808 Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Wed, 23 Oct 2024 13:33:49 -0400 Subject: [PATCH 4/4] comments --- .../src/browsergym/core/action/highlevel.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/browsergym/core/src/browsergym/core/action/highlevel.py b/browsergym/core/src/browsergym/core/action/highlevel.py index 2fca7fb6..2e4611dc 100644 --- a/browsergym/core/src/browsergym/core/action/highlevel.py +++ b/browsergym/core/src/browsergym/core/action/highlevel.py @@ -81,7 +81,8 @@ tab_focus, new_tab, ], - # loosely taken from https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L122 + # adapted from MiniWoB repo + # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L122 "miniwob_all": [ mouse_move, # MOVE_COORDS mouse_click, # CLICK_COORDS @@ -94,7 +95,8 @@ keyboard_type, # TYPE_TEX (and substitute for TYPE_FIELD() fill, # FOCUS_ELEMENT_AND_TYPE_TEXT (and substitute for FOCUS_ELEMENT_AND_TYPE_FIELD) ], - # loosely taken from https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L142 + # adapted from MiniWoB repo + # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L142 "miniwob_shi17": [ mouse_click, # CLICK_COORDS mouse_dblclick, # DBLCLICK_COORDS @@ -103,12 +105,14 @@ scroll, # SCROLL_UP_COORDS, SCROLL_DOWN_COORDS keyboard_press, # PRESS_KEY ], - # loosely taken from https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L160 + # adapted from MiniWoB repo + # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L160 "miniwob_liu18": [ click, # CLICK_ELEMENT fill, # substitute for FOCUS_ELEMENT_AND_TYPE_FIELD ], - # loosely taken from https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L173 + # adapted from MiniWoB repo + # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L173 "miniwob_humphreys22": [ mouse_move, # MOVE_COORDS mouse_click, # CLICK_COORDS @@ -119,7 +123,8 @@ keyboard_press, # PRESS_KEY keyboard_type, # substitute for TYPE_FIELD ], - # from webarena paper https://arxiv.org/abs/2307.13854 + # from webarena paper + # https://arxiv.org/abs/2307.13854 "webarena": [ click, # click(elem) hover, # hover(elem) @@ -135,7 +140,8 @@ send_msg_to_user, # report_infeasible, # explicit unachievable action, equivalent to "N/A" answer ], - # from visualwebarena paper https://arxiv.org/abs/2401.13649 + # from visualwebarena paper + # https://arxiv.org/abs/2401.13649 "visualwebarena": [ click, # click(elem) hover, # hover(elem) @@ -152,7 +158,8 @@ report_infeasible, # explicit unachievable action, equivalent to "N/A" answer upload_file, # ], - # from workarena paper https://arxiv.org/abs/2403.07718 + # from workarena paper + # https://arxiv.org/abs/2403.07718 "workarena": [ scroll, fill, @@ -166,7 +173,8 @@ drag_and_drop, send_msg_to_user, ], - # from workarena++ paper https://arxiv.org/abs/2407.05291 + # from workarena++ paper + # https://arxiv.org/abs/2407.05291 "workarena++": [ scroll, fill,