Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multi-tab fix #188

Merged
merged 10 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,5 @@ error_logs.txt
tests/results
tmp.py
.vscode/settings.json

results/
15 changes: 7 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,8 @@ print("\n".join(env_ids))
If you want to experiment with a demo agent in BrowserGym, follow these steps:

```sh
cd demo-agent
conda env create -f environment.yml
conda activate demo-agent
conda env create -f demo_agent/environment.yml
conda activate demo_agent
# or simply use `pip install -r demo_agent/requirements.txt`
playwright install chromium
```
Expand All @@ -172,27 +171,27 @@ Our demo agent uses `openai` as a backend, be sure to set your `OPENAI_API_KEY`.

Launch the demo agent on the open web:
```sh
python run_demo.py --task_name openended --start_url https://www.google.com
python demo_agent/run_demo.py --task_name openended --start_url https://www.google.com
```

Or use it to solve a simple MiniWoB task:
```sh
python run_demo.py --task_name miniwob.click-test
python demo_agent/run_demo.py --task_name miniwob.click-test
```

A VisualWebArena task:
```sh
python run_demo.py --task_name visualwebarena.398
python demo_agent/run_demo.py --task_name visualwebarena.398
```

A WebArena task:
```sh
python run_demo.py --task_name webarena.4
python demo_agent/run_demo.py --task_name webarena.4
```

A WorkArena task:
```sh
python run_demo.py --task_name workarena.servicenow.order-standard-laptop
python demo_agent/run_demo.py --task_name workarena.servicenow.order-standard-laptop
```

You can customize your experience by changing the `model_name` to your preferred LLM (it uses `gpt-4o-mini` by default), adding screenshots for your VLMs with `use_screenshot`, and much more! (see `python demo_agent/run_demo.py --help`)
Expand Down
34 changes: 30 additions & 4 deletions browsergym/core/src/browsergym/core/action/functions.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# these are placeholders
# all these symbols will be available in browsergym actions
import playwright.sync_api
from typing import Literal

import playwright.sync_api

from .utils import (
add_demo_mode_effects,
get_elem_by_bid,
Expand Down Expand Up @@ -527,7 +528,15 @@ def new_tab():
# set the new page as the active page
page = page.context.new_page()
# trigger the callback that sets this page as active in browsergym
page.locate("html").dispatch_event("pageshow")
page.evaluate(
"""\
const event = new Event('pageshow', {
bubbles: true, // Whether the event bubbles up through the DOM or not
cancelable: false // Whether the event can be canceled
});
window.dispatchEvent(event);
"""
)


# https://playwright.dev/python/docs/api/class-page#page-close
Expand All @@ -548,7 +557,15 @@ def tab_close():
else:
page = context.new_page()
# trigger the callback that sets this page as active in browsergym
page.locate("html").dispatch_event("pageshow")
page.evaluate(
"""\
const event = new Event('pageshow', {
bubbles: true, // Whether the event bubbles up through the DOM or not
cancelable: false // Whether the event can be canceled
});
window.dispatchEvent(event);
"""
)


# https://playwright.dev/python/docs/api/class-page#page-bring-to-front
Expand All @@ -561,8 +578,17 @@ def tab_focus(index: int):
"""
global page # set the focused page as the active page
page = page.context.pages[index]
page.bring_to_front()
# trigger the callback that sets this page as active in browsergym
page.locate("html").dispatch_event("pageshow")
page.evaluate(
"""\
const event = new Event('pageshow', {
bubbles: true, // Whether the event bubbles up through the DOM or not
cancelable: false // Whether the event can be canceled
});
window.dispatchEvent(event);
"""
)


# https://playwright.dev/python/docs/input#upload-files
Expand Down
6 changes: 5 additions & 1 deletion browsergym/core/src/browsergym/core/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ def __init__(
"open_pages_urls": gym.spaces.Sequence(
Unicode(min_length=0, max_length=TEXT_MAX_LENGTH)
),
"open_pages_titles": gym.spaces.Sequence(
Unicode(min_length=0, max_length=TEXT_MAX_LENGTH)
),
"active_page_index": gym.spaces.Box(low=0, high=255, dtype=int),
"url": Unicode(min_length=0, max_length=TEXT_MAX_LENGTH),
"screenshot": AnyBox(
Expand Down Expand Up @@ -542,8 +545,9 @@ def _get_obs(self):
"goal": _try_to_extract_legacy_goal(self.goal_object), # legacy goal, deprecated
"goal_object": self.goal_object, # new goal format, list of messages openai style
"open_pages_urls": [page.url for page in self.context.pages],
"open_pages_titles": [page.title() for page in self.context.pages],
"active_page_index": np.asarray([self.context.pages.index(self.page)]),
"url": self.page.url,
"url": self.page.url, # redundant with "open_pages_urls" and "active_page_index"
"screenshot": extract_screenshot(self.page),
"dom_object": dom,
"axtree_object": axtree,
Expand Down
22 changes: 17 additions & 5 deletions browsergym/visualwebarena/src/browsergym/visualwebarena/task.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import importlib.resources
import json
import logging
import playwright.sync_api
import importlib.resources
import pathlib
import tempfile
import requests

import urllib.parse
from typing import Optional, Tuple

import playwright.sync_api
import requests

from browsergym.core.task import AbstractBrowserTask

from .instance import VisualWebArenaInstance
Expand Down Expand Up @@ -226,7 +227,18 @@ def teardown(self) -> None:
def validate(
self, page: playwright.sync_api.Page, chat_messages: list[str]
) -> Tuple[float, bool, str, dict]:
# import webarena on instanciation

# safeguard: check that all open tabs are either blank or within the list of WebArena URLs
authorized_locations = ["newtab", ""] + [
urllib.parse.urlparse(url).netloc
for url in [*self.webarena_instance.urls.values(), self.webarena_instance.home_url]
]
for open_page in page.context.pages:
page_location = urllib.parse.urlparse(open_page.url).netloc
if not page_location in authorized_locations:
return 0, True, "", {"error": "Unauthorized url, terminating task"}

# import webarena dynamically
from visualwebarena.browser_env.actions import ActionTypes

# if any, use the last assistant message as the stop answer for webarena
Expand Down
22 changes: 17 additions & 5 deletions browsergym/webarena/src/browsergym/webarena/task.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import importlib.resources
import json
import logging
import numpy as np
import playwright.sync_api
import importlib.resources
import tempfile

import urllib.parse
from typing import Optional, Tuple

import numpy as np
import playwright.sync_api

from browsergym.core.task import AbstractBrowserTask

from .instance import WebArenaInstance
Expand Down Expand Up @@ -154,7 +155,18 @@ def teardown(self) -> None:
def validate(
self, page: playwright.sync_api.Page, chat_messages: list[str]
) -> Tuple[float, bool, str, dict]:
# import webarena on instanciation

# safeguard: check that all open tabs are either blank or within the list of WebArena URLs
authorized_locations = ["newtab", ""] + [
urllib.parse.urlparse(url).netloc
for url in [*self.webarena_instance.urls.values(), self.webarena_instance.home_url]
]
for open_page in page.context.pages:
page_location = urllib.parse.urlparse(open_page.url).netloc
if not page_location in authorized_locations:
return 0, True, "", {"error": "Unauthorized url, terminating task"}

# import webarena dynamically
from webarena.browser_env.actions import ActionTypes

# if any, use the last assistant message as the stop answer for webarena
Expand Down
40 changes: 33 additions & 7 deletions demo_agent/basic_agent.py → demo_agent/agent.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import base64
import dataclasses
import numpy as np
import io
import logging

import numpy as np
import openai
from PIL import Image

from browsergym.experiments import Agent, AbstractAgentArgs
from browsergym.core.action.highlevel import HighLevelActionSet
from browsergym.core.action.python import PythonActionSet
from browsergym.experiments import AbstractAgentArgs, Agent
from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -40,6 +41,9 @@ def obs_preprocessor(self, obs: dict) -> dict:
"goal_object": obs["goal_object"],
"last_action": obs["last_action"],
"last_action_error": obs["last_action_error"],
"open_pages_urls": obs["open_pages_urls"],
"open_pages_titles": obs["open_pages_titles"],
"active_page_index": obs["active_page_index"],
"axtree_txt": flatten_axtree_to_str(obs["axtree_object"]),
"pruned_html": prune_html(flatten_dom_to_str(obs["dom_object"])),
}
Expand All @@ -63,12 +67,10 @@ def __init__(
if not (use_html or use_axtree):
raise ValueError(f"Either use_html or use_axtree must be set to True.")

from openai import OpenAI

self.openai_client = OpenAI()
self.openai_client = openai.OpenAI()

self.action_set = HighLevelActionSet(
subsets=["chat", "bid", "infeas"], # define a subset of the action space
subsets=["chat", "tab", "nav", "bid", "infeas"], # define a subset of the action space
# subsets=["chat", "bid", "coord", "infeas"] # allow the agent to also use x,y coordinates
strict=False, # less strict on the parsing of the actions
multiaction=False, # does not enable the agent to take multiple actions at once
Expand Down Expand Up @@ -151,6 +153,29 @@ def get_action(self, obs: dict) -> tuple[str, dict]:
# goal_object is directly presented as a list of openai-style messages
user_msgs.extend(obs["goal_object"])

# append url of all open tabs
user_msgs.append(
{
"type": "text",
"text": f"""\
# Currently open tabs
""",
}
)
for page_index, (page_url, page_title) in enumerate(
zip(obs["open_pages_urls"], obs["open_pages_titles"])
):
user_msgs.append(
{
"type": "text",
"text": f"""\
Tab {page_index}{" (active tab)" if page_index == obs["active_page_index"] else ""}
Title: {page_title}
URL: {page_url}
""",
}
)

# append page AXTree (if asked)
if self.use_axtree:
user_msgs.append(
Expand Down Expand Up @@ -234,6 +259,7 @@ def get_action(self, obs: dict) -> tuple[str, dict]:
{
"type": "text",
"text": f"""\

{action}
""",
}
Expand Down Expand Up @@ -261,7 +287,7 @@ def get_action(self, obs: dict) -> tuple[str, dict]:
"text": f"""\
# Next action

You will now think step by step and produce your next best action. Reflect on your past actions, any resulting error message, the current state of the page before deciding on your next action.
You will now think step by step and produce your next best action. Reflect on your past actions, any resulting error message, and the current state of the page before deciding on your next action.
""",
}
)
Expand Down
16 changes: 2 additions & 14 deletions demo_agent/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,2 @@
browsergym-core>=0.3
browsergym-experiments>=0.3
openai>=1.35.4,<1.36
langchain>=0.2,<0.3
langchain_openai>=0.1.10,<0.2
tiktoken
huggingface_hub
contexttimer
ipython
pyyaml>=6
pandas
joblib
transformers
langchain_community>=0.2.6,<0.3
browsergym
openai
6 changes: 3 additions & 3 deletions demo_agent/run_demo.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import argparse

# locally defined agent
from agent import DemoAgentArgs

# browsergym experiments utils
from browsergym.experiments import EnvArgs, ExpArgs, get_exp_result

# locally defined agent
from basic_agent import DemoAgentArgs


def str2bool(v):
if isinstance(v, bool):
Expand Down
23 changes: 23 additions & 0 deletions sandbox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from dataclasses import dataclass

from dataclasses_json import DataClassJsonMixin


@dataclass
class Test(DataClassJsonMixin):
    """Two-field record used to try out the JSON helpers of DataClassJsonMixin."""

    # Field order matters: instances are built positionally as Test(a, b).
    a: int
    b: str

    def do_something(self):
        """Print both fields on one line, separated by a single space."""
        print(self.a, self.b)


# Serialize an instance to JSON, show the JSON text, then rebuild an
# instance from that text and call its method.
original: Test = Test(0, "hello")

serialized = original.to_json()
print(serialized)

restored = Test.from_json(serialized)
restored.do_something()
22 changes: 22 additions & 0 deletions sandbox2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from browsergym.workarena import (
AGENT_CURRICULUM_L2,
AGENT_CURRICULUM_L3,
TASK_CATEGORY_MAP,
)

# Rows are (task id, level label, category) tuples, emitted below as CSV lines.
# Entries from TASK_CATEGORY_MAP are labelled "l1".
metadata = [
    (task_name, "l1", category) for task_name, category in TASK_CATEGORY_MAP.items()
]

# The two curricula share the same layout (category -> {"buckets": [task sets]}),
# so one loop handles both; only the level label differs.
for level, curriculum in (("l2", AGENT_CURRICULUM_L2), ("l3", AGENT_CURRICULUM_L3)):
    for category, items in curriculum.items():
        for task_set in items["buckets"]:
            metadata.extend((task.get_task_id(), level, category) for task in task_set)

# One comma-separated line per row.
print("\n".join(",".join(row) for row in metadata))
Empty file added sandbox3.py
Empty file.
Loading