From 00429a74b3a3c3d6272f7968b7caa0c48833a63a Mon Sep 17 00:00:00 2001 From: Maxime Gasse Date: Thu, 14 Nov 2024 10:03:44 -0500 Subject: [PATCH] VWA / WA updates (#254) --- .../browsergym/experiments/benchmark/utils.py | 56 ++++++++++++++----- .../src/browsergym/visualwebarena/instance.py | 6 ++ .../src/browsergym/visualwebarena/task.py | 4 +- .../src/browsergym/webarena/instance.py | 6 ++ .../webarena/src/browsergym/webarena/task.py | 4 +- 5 files changed, 58 insertions(+), 18 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py index fcf4ca05..c64ae843 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py @@ -129,28 +129,56 @@ def prepare_backend(backend: str): vwa_massage_task_ids = [ 0, # classifieds 33, # classifieds - 150, # classifieds - 253, # reddit - 325, # reddit - 390, # reddit - 410, # shopping 555, # shopping 666, # shopping + 282, # __REDDIT__/f/dataisbeautiful + 305, # __REDDIT__/f/memes/new + 314, # __REDDIT__/f/mildlyinteresting + 317, # __REDDIT__/f/Art/active + 318, # __REDDIT__/f/consoles + 319, # __REDDIT__/f/EarthPorn + 410, # __REDDIT__/f/food + 411, # __REDDIT__/f/food + 427, # __REDDIT__/f/EarthPorn + 436, # __REDDIT__/f/Art + 440, # __REDDIT__/f/EarthPorn ] + vwa_massage_max_retries = 1 for i, task_id in enumerate(vwa_massage_task_ids): gym_id = f"browsergym/visualwebarena.{task_id}" logger.info( f"VisualWebArena instance massaging {i + 1} / {len(vwa_massage_task_ids)} ({gym_id} reset)" ) - env = gym.make(gym_id) - try: - env.reset() # task setup and logging - except Exception as e: - logger.warning( - f"Error during VisualWebArena instance massaging ({gym_id} reset): {e}" - ) - finally: - env.close() + retries = 0 + while True: + env = gym.make(gym_id) + try: + env.reset() # task setup + no_action = "noop()" + try: + # check if action space exists and is compatible with "noop()" + env.unwrapped.action_mapping(no_action) + except: + # fallback plan + no_action = "" + env.step(no_action) # task validation + env.step(no_action) # task validation again + logger.info(f"Massage successful") + break + except Exception as e: + if retries < vwa_massage_max_retries: + retries += 1 + logger.info( + f"Massage failed, retrying ({retries} / {vwa_massage_max_retries})" + ) + continue + else: + logger.warning( + f"Error during VisualWebArena instance massaging ({gym_id}, {retries} retries): {e}" + ) + break + finally: + env.close() case "workarena": # register environments diff --git a/browsergym/visualwebarena/src/browsergym/visualwebarena/instance.py b/browsergym/visualwebarena/src/browsergym/visualwebarena/instance.py index 12a39918..63936e8a 100644 --- a/browsergym/visualwebarena/src/browsergym/visualwebarena/instance.py +++ b/browsergym/visualwebarena/src/browsergym/visualwebarena/instance.py @@ -64,6 +64,9 @@ def ui_login(self, site: str, page: playwright.sync_api.Page): url = self.urls[site] + # open a new page (tab) to perform the login + page = page.context.new_page() + match site: case "reddit": username = self.credentials[site]["username"] @@ -95,3 +98,6 @@ def ui_login(self, site: str, page: playwright.sync_api.Page): case _: raise ValueError + + # release login page + page.close() diff --git a/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py b/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py index 61486409..77c0dd40 100644 --- a/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py +++ b/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py @@ -286,8 +286,8 @@ def validate( page=page, # none of webarena's evaluators requires a cdp session ) # llm_fuzzy_match() bugfix (assert "correct" in response) - except AssertionError as e: - logger.info( + except AssertionError: + logger.debug( "llm_fuzzy_match() bugfix applied: AssertionError in evaluator, using score = 0.0" ) score = 0.0 diff --git a/browsergym/webarena/src/browsergym/webarena/instance.py b/browsergym/webarena/src/browsergym/webarena/instance.py index 812dcf66..b343ebf7 100644 --- a/browsergym/webarena/src/browsergym/webarena/instance.py +++ b/browsergym/webarena/src/browsergym/webarena/instance.py @@ -147,6 +147,9 @@ def ui_login(self, site: str, page: playwright.sync_api.Page): url = self.urls[site] + # open a new page (tab) to perform the login + page = page.context.new_page() + match site: case "reddit": username = self.credentials[site]["username"] @@ -193,3 +196,6 @@ def ui_login(self, site: str, page: playwright.sync_api.Page): case _: raise ValueError + + # release login page + page.close() diff --git a/browsergym/webarena/src/browsergym/webarena/task.py b/browsergym/webarena/src/browsergym/webarena/task.py index 90b63a83..3467c152 100644 --- a/browsergym/webarena/src/browsergym/webarena/task.py +++ b/browsergym/webarena/src/browsergym/webarena/task.py @@ -191,8 +191,8 @@ def validate( client=None, # none of webarena's evaluators requires a cdp session ) # llm_fuzzy_match() bugfix (assert "correct" in response) - except AssertionError as e: - logger.info( + except AssertionError: + logger.debug( "llm_fuzzy_match() bugfix applied: AssertionError in evaluator, using score = 0.0" ) score = 0.0