From 34c74f82beecc1d22f14e5fd73f58a325a74cd67 Mon Sep 17 00:00:00 2001
From: lawyzheng
Date: Fri, 15 Nov 2024 13:57:22 +0800
Subject: [PATCH] introduce complete verification

---
 skyvern/forge/agent.py                       | 69 ++++++++++---------
 .../forge/prompts/skyvern/check-user-goal.j2 |  7 ++
 skyvern/webeye/actions/actions.py            | 10 +++
 skyvern/webeye/actions/handler.py            | 41 ++++++++++-
 4 files changed, 91 insertions(+), 36 deletions(-)

diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py
index 059d1cdf6..a5a393dd9 100644
--- a/skyvern/forge/agent.py
+++ b/skyvern/forge/agent.py
@@ -49,6 +49,7 @@
     Action,
     ActionType,
     CompleteAction,
+    CompleteVerifyResult,
     DecisiveAction,
     UserDefinedError,
     WebAction,
@@ -923,57 +924,59 @@ async def agent_step(
             )
             return failed_step, detailed_agent_step_output.get_clean_detailed_output()
 
+    @staticmethod
+    async def complete_verify(page: Page, scraped_page: ScrapedPage, task: Task, step: Step) -> CompleteVerifyResult:
+        LOG.info(
+            "Checking if user goal is achieved after re-scraping the page",
+            task_id=task.task_id,
+            step_id=step.step_id,
+            workflow_run_id=task.workflow_run_id,
+        )
+        scraped_page_refreshed = await scraped_page.refresh()
+
+        # TODO: currently, just using the check user goal for complete verification
+        # maybe need a designed complete criterion in the future
+        verification_prompt = prompt_engine.load_prompt(
+            "check-user-goal",
+            navigation_goal=task.navigation_goal,
+            navigation_payload=task.navigation_payload,
+            elements=scraped_page_refreshed.build_element_tree(ElementTreeFormat.HTML),
+        )
+
+        # this prompt is critical to our agent so let's use the primary LLM API handler
+        verification_result = await app.LLM_API_HANDLER(
+            prompt=verification_prompt, step=step, screenshots=scraped_page_refreshed.screenshots
+        )
+        return CompleteVerifyResult.model_validate(verification_result)
+
     @staticmethod
     async def check_user_goal_complete(
         page: Page, scraped_page: ScrapedPage, task: Task, step: Step
     ) -> CompleteAction | None:
         try:
-            LOG.info(
-                "Checking if user goal is achieved after re-scraping the page without screenshots",
-                task_id=task.task_id,
-                step_id=step.step_id,
-                workflow_run_id=task.workflow_run_id,
-            )
-            scraped_page_refreshed = await scraped_page.refresh()
-
-            verification_prompt = prompt_engine.load_prompt(
-                "check-user-goal",
-                navigation_goal=task.navigation_goal,
-                navigation_payload=task.navigation_payload,
-                elements=scraped_page_refreshed.build_element_tree(ElementTreeFormat.HTML),
+            verification_result = await app.agent.complete_verify(
+                page=page,
+                scraped_page=scraped_page,
+                task=task,
+                step=step,
             )
 
-            # this prompt is critical to our agent so let's use the primary LLM API handler
-            verification_response = await app.LLM_API_HANDLER(
-                prompt=verification_prompt, step=step, screenshots=scraped_page_refreshed.screenshots
-            )
-            if "user_goal_achieved" not in verification_response or "thoughts" not in verification_response:
-                LOG.error(
-                    "Invalid LLM response for user goal success verification, skipping verification",
-                    verification_response=verification_response,
-                    task_id=task.task_id,
-                    step_id=step.step_id,
-                    workflow_run_id=task.workflow_run_id,
-                )
-                return None
-
-            user_goal_achieved: bool = verification_response["user_goal_achieved"]
 
             # We don't want to return a complete action if the user goal is not achieved since we're checking at every step
-            if not user_goal_achieved:
+            if not verification_result.user_goal_achieved:
                 return None
 
             return CompleteAction(
-                reasoning=verification_response["thoughts"],
+                reasoning=verification_result.thoughts,
                 data_extraction_goal=task.data_extraction_goal,
+                verified=True,
             )
         except Exception:
-            LOG.error(
-                "LLM verification failed for complete action, skipping LLM verification",
+            LOG.exception(
+                "Failed to check user goal complete, skipping",
                 task_id=task.task_id,
                 step_id=step.step_id,
                 workflow_run_id=task.workflow_run_id,
-                exc_info=True,
             )
             return None
 
diff --git a/skyvern/forge/prompts/skyvern/check-user-goal.j2 b/skyvern/forge/prompts/skyvern/check-user-goal.j2
index b5af1d719..e850f2a6d 100644
--- a/skyvern/forge/prompts/skyvern/check-user-goal.j2
+++ b/skyvern/forge/prompts/skyvern/check-user-goal.j2
@@ -7,12 +7,19 @@ Make sure to ONLY return the JSON object in this format with no additional text
   "thoughts": str, // Think step by step. What information makes you believe whether user goal has completed or not. Use information you see on the site to explain.
   "user_goal_achieved": bool // True if the user goal has been completed, false otherwise.
 }
+```
 
 Elements on the page:
+```
 {{ elements }}
+```
 
 User Goal:
+```
 {{ navigation_goal }}
+```
 
 User Details:
+```
 {{ navigation_payload }}
+```
\ No newline at end of file
diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py
index 0e23c0cc4..c63431c1e 100644
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -65,6 +65,15 @@ def __repr__(self) -> str:
         return f"SelectOption(label={self.label}, value={self.value}, index={self.index})"
 
 
+class CompleteVerifyResult(BaseModel):
+    user_goal_achieved: bool
+    thoughts: str
+    page_info: str | None = None
+
+    def __repr__(self) -> str:
+        return f"CompleteVerifyResult(thoughts={self.thoughts}, user_goal_achieved={self.user_goal_achieved}, page_info={self.page_info})"
+
+
 class InputOrSelectContext(BaseModel):
     field: str | None = None
     is_required: bool | None = None
@@ -226,6 +235,7 @@ class TerminateAction(DecisiveAction):
 
 class CompleteAction(DecisiveAction):
     action_type: ActionType = ActionType.COMPLETE
+    verified: bool = False
 
     data_extraction_goal: str | None = None
 
diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py
index e6b890295..1cb29581f 100644
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -1032,9 +1032,14 @@ async def handle_complete_action(
 ) -> list[ActionResult]:
     # If this action has a source_action_id, then we need to make sure if the goal is actually completed.
     if action.source_action_id:
-        LOG.info("CompleteAction has source_action_id, checking if goal is completed")
-        complete_action = await app.agent.check_user_goal_complete(page, scraped_page, task, step)
-        if complete_action is None:
+        LOG.info(
+            "CompleteAction has source_action_id, checking if goal is completed",
+            task_id=task.task_id,
+            step_id=step.step_id,
+            workflow_run_id=task.workflow_run_id,
+        )
+        verified_complete_action = await app.agent.check_user_goal_complete(page, scraped_page, task, step)
+        if verified_complete_action is None:
             return [
                 ActionFailure(
                     exception=IllegitComplete(
@@ -1044,6 +1049,36 @@ async def handle_complete_action(
                     )
                 )
             ]
+        action.verified = True
+
+    if not action.verified:
+        LOG.info(
+            "CompleteAction hasn't been verified, going to verify the user goal",
+            task_id=task.task_id,
+            step_id=step.step_id,
+            workflow_run_id=task.workflow_run_id,
+        )
+        try:
+            verification_result = await app.agent.complete_verify(page, scraped_page, task, step)
+        except Exception as e:
+            LOG.exception(
+                "Failed to verify the complete action",
+                task_id=task.task_id,
+                step_id=step.step_id,
+                workflow_run_id=task.workflow_run_id,
+            )
+            return [ActionFailure(exception=e)]
+
+        if not verification_result.user_goal_achieved:
+            return [ActionFailure(exception=IllegitComplete(data={"error": verification_result.thoughts}))]
+
+        LOG.info(
+            "CompleteAction has been verified successfully",
+            task_id=task.task_id,
+            step_id=step.step_id,
+            workflow_run_id=task.workflow_run_id,
+        )
+        action.verified = True
 
     extracted_data = None
     if action.data_extraction_goal:
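
For context, here is a minimal sketch, outside the patch itself, of how CompleteVerifyResult.model_validate replaces the old hand-rolled key check on the LLM response. The dict literals only mirror the JSON shape requested by check-user-goal.j2; the field values are made up for illustration.

    from pydantic import BaseModel, ValidationError


    class CompleteVerifyResult(BaseModel):
        user_goal_achieved: bool
        thoughts: str
        page_info: str | None = None


    # A well-formed LLM response parses into a typed result.
    result = CompleteVerifyResult.model_validate(
        {"thoughts": "The confirmation page is displayed.", "user_goal_achieved": True}
    )
    assert result.user_goal_achieved

    # A malformed response (e.g. missing user_goal_achieved) now raises ValidationError.
    # In the patch, check_user_goal_complete catches this, logs, and returns None, while
    # handle_complete_action turns a failure in complete_verify into an ActionFailure.
    try:
        CompleteVerifyResult.model_validate({"thoughts": "missing the boolean field"})
    except ValidationError:
        pass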