ServiceNow · gasse · Oct 18, 2024 · Oct 15, 2024 · Oct 17, 2024 · Oct 17, 2024
diff --git a/.gitignore b/.gitignore
@@ -139,3 +139,5 @@ error_logs.txt
 tests/results
 tmp.py
 .vscode/settings.json
+
+results/
diff --git a/README.md b/README.md
@@ -161,9 +161,8 @@ print("\n".join(env_ids))
 If you want to experiment with a demo agent in BrowserGym, follow these steps:
 
 ```sh
-cd demo-agent
-conda env create -f environment.yml
-conda activate demo-agent
+conda env create -f demo_agent/environment.yml
+conda activate demo_agent
-# or simply use `pip install -r requirements.txt`
+# or simply use `pip install -r demo_agent/requirements.txt`
 playwright install chromium
 ```
@@ -172,27 +171,27 @@ Our demo agent uses `openai` as a backend, be sure to set your `OPENAI_API_KEY`.
 
 Launch the demo agent on the open web:
 ```sh
-python run_demo.py --task_name openended --start_url https://www.google.com
+python demo_agent/run_demo.py --task_name openended --start_url https://www.google.com
 ```
 
 Or use it to solve a simple MiniWoB task:
 ```sh
-python run_demo.py --task_name miniwob.click-test
+python demo_agent/run_demo.py --task_name miniwob.click-test
 ```
 
 A VisualWebArena task:
 ```sh
-python run_demo.py --task_name visualwebarena.398
+python demo_agent/run_demo.py --task_name visualwebarena.398
 ```
 
 A WebArena task:
 ```sh
-python run_demo.py --task_name webarena.4
+python demo_agent/run_demo.py --task_name webarena.4
 ```
 
 A WorkArena task:
 ```sh
-python run_demo.py --task_name workarena.servicenow.order-standard-laptop
+python demo_agent/run_demo.py --task_name workarena.servicenow.order-standard-laptop
 ```
 
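-You can customize your experience by changing the `model_name` to your preferred LLM (it uses `gpt-4o-mini` by default), adding screenshots for your VLMs with `use_screenshot`, and much more! (see `python run_demo.py --help`)
+You can customize your experience by changing the `model_name` to your preferred LLM (it uses `gpt-4o-mini` by default), adding screenshots for your VLMs with `use_screenshot`, and much more! (see `python demo_agent/run_demo.py --help`)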

diff --git a/browsergym/core/src/browsergym/core/action/functions.py b/browsergym/core/src/browsergym/core/action/functions.py
@@ -1,8 +1,9 @@
 # these are placeholders
 # all these symbols will be available in browsergym actions
-import playwright.sync_api
 from typing import Literal
 
+import playwright.sync_api
+
 from .utils import (
     add_demo_mode_effects,
     get_elem_by_bid,
@@ -527,7 +528,15 @@ def new_tab():
     # set the new page as the active page
     page = page.context.new_page()
     # trigger the callback that sets this page as active in browsergym
-    page.locate("html").dispatch_event("pageshow")
+    page.evaluate(
+        """\
+const event = new Event('pageshow', {
+    bubbles: true,  // Whether the event bubbles up through the DOM or not
+    cancelable: false  // Whether the event can be canceled
+});
+window.dispatchEvent(event);
+"""
+    )
 
 
 # https://playwright.dev/python/docs/api/class-page#page-close
@@ -548,7 +557,15 @@ def tab_close():
     else:
         page = context.new_page()
     # trigger the callback that sets this page as active in browsergym
-    page.locate("html").dispatch_event("pageshow")
+    page.evaluate(
+        """\
+const event = new Event('pageshow', {
+    bubbles: true,  // Whether the event bubbles up through the DOM or not
+    cancelable: false  // Whether the event can be canceled
+});
+window.dispatchEvent(event);
+"""
+    )
 
 
 # https://playwright.dev/python/docs/api/class-page#page-bring-to-front
@@ -561,8 +578,17 @@ def tab_focus(index: int):
     """
     global page  # set the focused page as the active page
     page = page.context.pages[index]
+    page.bring_to_front()
     # trigger the callback that sets this page as active in browsergym
-    page.locate("html").dispatch_event("pageshow")
+    page.evaluate(
+        """\
+const event = new Event('pageshow', {
+    bubbles: true,  // Whether the event bubbles up through the DOM or not
+    cancelable: false  // Whether the event can be canceled
+});
+window.dispatchEvent(event);
+"""
+    )
 
 
 # https://playwright.dev/python/docs/input#upload-files

diff --git a/browsergym/core/src/browsergym/core/env.py b/browsergym/core/src/browsergym/core/env.py
@@ -141,6 +141,9 @@ def __init__(
                 "open_pages_urls": gym.spaces.Sequence(
                     Unicode(min_length=0, max_length=TEXT_MAX_LENGTH)
                 ),
+                "open_pages_titles": gym.spaces.Sequence(
+                    Unicode(min_length=0, max_length=TEXT_MAX_LENGTH)
+                ),
                 "active_page_index": gym.spaces.Box(low=0, high=255, dtype=int),
                 "url": Unicode(min_length=0, max_length=TEXT_MAX_LENGTH),
                 "screenshot": AnyBox(
@@ -542,8 +545,9 @@ def _get_obs(self):
             "goal": _try_to_extract_legacy_goal(self.goal_object),  # legacy goal, deprecated
             "goal_object": self.goal_object,  # new goal format, list of messages openai style
             "open_pages_urls": [page.url for page in self.context.pages],
+            "open_pages_titles": [page.title() for page in self.context.pages],
             "active_page_index": np.asarray([self.context.pages.index(self.page)]),
-            "url": self.page.url,
+            "url": self.page.url,  # redundant with "open_pages_urls" and "active_page_index"
             "screenshot": extract_screenshot(self.page),
             "dom_object": dom,
             "axtree_object": axtree,

diff --git a/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py b/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py
@@ -1,13 +1,14 @@
+import importlib.resources
 import json
 import logging
-import playwright.sync_api
-import importlib.resources
 import pathlib
 import tempfile
-import requests
-
+import urllib.parse
 from typing import Optional, Tuple
 
+import playwright.sync_api
+import requests
+
 from browsergym.core.task import AbstractBrowserTask
 
 from .instance import VisualWebArenaInstance
@@ -226,7 +227,18 @@ def teardown(self) -> None:
     def validate(
         self, page: playwright.sync_api.Page, chat_messages: list[str]
     ) -> Tuple[float, bool, str, dict]:
-        # import webarena on instanciation
+
+        # safeguard: check that all open tabs are either blank or within the list of WebArena URLs
+        authorized_locations = ["newtab", ""] + [
+            urllib.parse.urlparse(url).netloc
+            for url in [*self.webarena_instance.urls.values(), self.webarena_instance.home_url]
+        ]
+        for open_page in page.context.pages:
+            page_location = urllib.parse.urlparse(open_page.url).netloc
+            if not page_location in authorized_locations:
+                return 0, True, "", {"error": "Unauthorized url, terminating task"}
+
+        # import webarena dynamically
         from visualwebarena.browser_env.actions import ActionTypes
 
         # if any, use the last assistant message as the stop answer for webarena

diff --git a/browsergym/webarena/src/browsergym/webarena/task.py b/browsergym/webarena/src/browsergym/webarena/task.py
@@ -1,12 +1,13 @@
+import importlib.resources
 import json
 import logging
-import numpy as np
-import playwright.sync_api
-import importlib.resources
 import tempfile
-
+import urllib.parse
 from typing import Optional, Tuple
 
+import numpy as np
+import playwright.sync_api
+
 from browsergym.core.task import AbstractBrowserTask
 
 from .instance import WebArenaInstance
@@ -154,7 +155,18 @@ def teardown(self) -> None:
     def validate(
         self, page: playwright.sync_api.Page, chat_messages: list[str]
     ) -> Tuple[float, bool, str, dict]:
-        # import webarena on instanciation
+
+        # safeguard: check that all open tabs are either blank or within the list of WebArena URLs
+        authorized_locations = ["newtab", ""] + [
+            urllib.parse.urlparse(url).netloc
+            for url in [*self.webarena_instance.urls.values(), self.webarena_instance.home_url]
+        ]
+        for open_page in page.context.pages:
+            page_location = urllib.parse.urlparse(open_page.url).netloc
+            if not page_location in authorized_locations:
+                return 0, True, "", {"error": "Unauthorized url, terminating task"}
+
+        # import webarena dynamically
         from webarena.browser_env.actions import ActionTypes
 
         # if any, use the last assistant message as the stop answer for webarena

diff --git a/demo_agent/basic_agent.py → demo_agent/agent.py b/demo_agent/basic_agent.py → demo_agent/agent.py
@@ -1,14 +1,15 @@
 import base64
 import dataclasses
-import numpy as np
 import io
 import logging
 
+import numpy as np
+import openai
 from PIL import Image
 
-from browsergym.experiments import Agent, AbstractAgentArgs
 from browsergym.core.action.highlevel import HighLevelActionSet
 from browsergym.core.action.python import PythonActionSet
+from browsergym.experiments import AbstractAgentArgs, Agent
 from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html
 
 logger = logging.getLogger(__name__)
@@ -40,6 +41,9 @@ def obs_preprocessor(self, obs: dict) -> dict:
             "goal_object": obs["goal_object"],
             "last_action": obs["last_action"],
             "last_action_error": obs["last_action_error"],
+            "open_pages_urls": obs["open_pages_urls"],
+            "open_pages_titles": obs["open_pages_titles"],
+            "active_page_index": obs["active_page_index"],
             "axtree_txt": flatten_axtree_to_str(obs["axtree_object"]),
             "pruned_html": prune_html(flatten_dom_to_str(obs["dom_object"])),
         }
@@ -63,12 +67,10 @@ def __init__(
         if not (use_html or use_axtree):
             raise ValueError(f"Either use_html or use_axtree must be set to True.")
 
-        from openai import OpenAI
-
-        self.openai_client = OpenAI()
+        self.openai_client = openai.OpenAI()
 
         self.action_set = HighLevelActionSet(
-            subsets=["chat", "bid", "infeas"],  # define a subset of the action space
+            subsets=["chat", "tab", "nav", "bid", "infeas"],  # define a subset of the action space
             # subsets=["chat", "bid", "coord", "infeas"] # allow the agent to also use x,y coordinates
             strict=False,  # less strict on the parsing of the actions
             multiaction=False,  # does not enable the agent to take multiple actions at once
@@ -151,6 +153,29 @@ def get_action(self, obs: dict) -> tuple[str, dict]:
             # goal_object is directly presented as a list of openai-style messages
             user_msgs.extend(obs["goal_object"])
 
+        # append the title and URL of every open tab, marking the active one
+        user_msgs.append(
+            {
+                "type": "text",
+                "text": f"""\
+# Currently open tabs
+""",
+            }
+        )
+        for page_index, (page_url, page_title) in enumerate(
+            zip(obs["open_pages_urls"], obs["open_pages_titles"])
+        ):
+            user_msgs.append(
+                {
+                    "type": "text",
+                    "text": f"""\
+Tab {page_index}{" (active tab)" if page_index == obs["active_page_index"] else ""}
+  Title: {page_title}
+  URL: {page_url}
+""",
+                }
+            )
+
         # append page AXTree (if asked)
         if self.use_axtree:
             user_msgs.append(
@@ -234,6 +259,7 @@ def get_action(self, obs: dict) -> tuple[str, dict]:
                     {
                         "type": "text",
                         "text": f"""\
+
 {action}
 """,
                     }
@@ -261,7 +287,7 @@ def get_action(self, obs: dict) -> tuple[str, dict]:
                 "text": f"""\
 # Next action
 
-You will now think step by step and produce your next best action. Reflect on your past actions, any resulting error message, the current state of the page before deciding on your next action.
+You will now think step by step and produce your next best action. Reflect on your past actions, any resulting error message, and the current state of the page before deciding on your next action.
 """,
             }
         )

diff --git a/demo_agent/requirements.txt b/demo_agent/requirements.txt
@@ -1,14 +1,2 @@
-browsergym-core>=0.3
-browsergym-experiments>=0.3
-openai>=1.35.4,<1.36
-langchain>=0.2,<0.3
-langchain_openai>=0.1.10,<0.2
-tiktoken
-huggingface_hub
-contexttimer
-ipython
-pyyaml>=6
-pandas
-joblib
-transformers
-langchain_community>=0.2.6,<0.3
+browsergym
+openai
diff --git a/demo_agent/run_demo.py b/demo_agent/run_demo.py
@@ -1,11 +1,11 @@
 import argparse
 
+# locally defined agent
+from agent import DemoAgentArgs
+
 # browsergym experiments utils
 from browsergym.experiments import EnvArgs, ExpArgs, get_exp_result
 
-# locally defined agent
-from basic_agent import DemoAgentArgs
-
 
 def str2bool(v):
     if isinstance(v, bool):

diff --git a/sandbox.py b/sandbox.py
@@ -0,0 +1,23 @@
+from dataclasses import dataclass
+
+from dataclasses_json import DataClassJsonMixin
+
+
+@dataclass
+class Test(DataClassJsonMixin):
+    a: int
+    b: str
+
+    def do_something(self):
+        print(self.a, self.b)
+
+
+x: Test = Test(0, "hello")
+
+x_json = x.to_json()
+
+print(x_json)
+
+y = Test.from_json(x_json)
+
+y.do_something()
diff --git a/sandbox2.py b/sandbox2.py
@@ -0,0 +1,22 @@
+from browsergym.workarena import (
+    AGENT_CURRICULUM_L2,
+    AGENT_CURRICULUM_L3,
+    TASK_CATEGORY_MAP,
+)
+
+metadata = []
+
+for task_name, category in TASK_CATEGORY_MAP.items():
+    metadata.append((task_name, "l1", category))
+
+for category, items in AGENT_CURRICULUM_L2.items():
+    for task_set in items["buckets"]:
+        for task in task_set:
+            metadata.append((task.get_task_id(), "l2", category))
+
+for category, items in AGENT_CURRICULUM_L3.items():
+    for task_set in items["buckets"]:
+        for task in task_set:
+            metadata.append((task.get_task_id(), "l3", category))
+
+print("\n".join([",".join(x) for x in metadata]))
diff --git a/sandbox3.py b/sandbox3.py