Skip to content

Commit

Permalink
VWA / WA updates (#254)
Browse files (browse the repository at this point in the history)
  • Loading branch information
gasse authored Nov 14, 2024
1 parent 3a30826 commit 00429a7
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 18 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -129,28 +129,56 @@ def prepare_backend(backend: str):
vwa_massage_task_ids = [
0, # classifieds
33, # classifieds
150, # classifieds
253, # reddit
325, # reddit
390, # reddit
410, # shopping
555, # shopping
666, # shopping
282, # __REDDIT__/f/dataisbeautiful
305, # __REDDIT__/f/memes/new
314, # __REDDIT__/f/mildlyinteresting
317, # __REDDIT__/f/Art/active
318, # __REDDIT__/f/consoles
319, # __REDDIT__/f/EarthPorn
410, # __REDDIT__/f/food
411, # __REDDIT__/f/food
427, # __REDDIT__/f/EarthPorn
436, # __REDDIT__/f/Art
440, # __REDDIT__/f/EarthPorn
]
vwa_massage_max_retries = 1
for i, task_id in enumerate(vwa_massage_task_ids):
gym_id = f"browsergym/visualwebarena.{task_id}"
logger.info(
f"VisualWebArena instance massaging {i + 1} / {len(vwa_massage_task_ids)} ({gym_id} reset)"
)
env = gym.make(gym_id)
try:
env.reset() # task setup and logging
except Exception as e:
logger.warning(
f"Error during VisualWebArena instance massaging ({gym_id} reset): {e}"
)
finally:
env.close()
retries = 0
while True:
env = gym.make(gym_id)
try:
env.reset() # task setup
no_action = "noop()"
try:
# check if action space exists and is compatible with "noop()"
env.unwrapped.action_mapping(no_action)
except:
# fallback plan
no_action = ""
env.step(no_action) # task validation
env.step(no_action) # task validation again
logger.info(f"Massage successful")
break
except Exception as e:
if retries < vwa_massage_max_retries:
retries += 1
logger.info(
f"Massage failed, retrying ({retries} / {vwa_massage_max_retries})"
)
continue
else:
logger.warning(
f"Error during VisualWebArena instance massaging ({gym_id}, {retries} retries): {e}"
)
break
finally:
env.close()

case "workarena":
# register environments
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -64,6 +64,9 @@ def ui_login(self, site: str, page: playwright.sync_api.Page):

url = self.urls[site]

# open a new page (tab) to perform the login
page = page.context.new_page()

match site:
case "reddit":
username = self.credentials[site]["username"]
Expand Down Expand Up @@ -95,3 +98,6 @@ def ui_login(self, site: str, page: playwright.sync_api.Page):

case _:
raise ValueError

# release login page
page.close()
Original file line number | Diff line number | Diff line change
Expand Up @@ -286,8 +286,8 @@ def validate(
page=page, # none of webarena's evaluators requires a cdp session
)
# llm_fuzzy_match() bugfix (assert "correct" in response)
except AssertionError as e:
logger.info(
except AssertionError:
logger.debug(
"llm_fuzzy_match() bugfix applied: AssertionError in evaluator, using score = 0.0"
)
score = 0.0
Expand Down
6 changes: 6 additions & 0 deletions browsergym/webarena/src/browsergym/webarena/instance.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -147,6 +147,9 @@ def ui_login(self, site: str, page: playwright.sync_api.Page):

url = self.urls[site]

# open a new page (tab) to perform the login
page = page.context.new_page()

match site:
case "reddit":
username = self.credentials[site]["username"]
Expand Down Expand Up @@ -193,3 +196,6 @@ def ui_login(self, site: str, page: playwright.sync_api.Page):

case _:
raise ValueError

# release login page
page.close()
4 changes: 2 additions & 2 deletions browsergym/webarena/src/browsergym/webarena/task.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -191,8 +191,8 @@ def validate(
client=None, # none of webarena's evaluators requires a cdp session
)
# llm_fuzzy_match() bugfix (assert "correct" in response)
except AssertionError as e:
logger.info(
except AssertionError:
logger.debug(
"llm_fuzzy_match() bugfix applied: AssertionError in evaluator, using score = 0.0"
)
score = 0.0
Expand Down

0 comments on commit 00429a7

Please sign in to comment.