diff --git a/skyvern/forge/prompts/skyvern/auto-completion-choose-option.j2 b/skyvern/forge/prompts/skyvern/auto-completion-choose-option.j2 index 9b6d028ab4..b9ff8f2736 100644 --- a/skyvern/forge/prompts/skyvern/auto-completion-choose-option.j2 +++ b/skyvern/forge/prompts/skyvern/auto-completion-choose-option.j2 @@ -1,6 +1,7 @@ -There is an input element on an HTML page. Based on the context and information provided, you have two goals: +There is an input element on an HTML page. Based on the context and information provided, you have {{ "three" if is_search else "two" }} goals: - Confirm if an auto-completion attempt appears after the user inputs the current value. - - If auto-completion suggestions appear, assist the user in selecting the most appropriate element based on the user's goal, details, and the context. + - If auto-completion suggestions appear, assist the user in selecting the most appropriate element based on the user's goal, details, and the context.{% if is_search %} + - Confirm if direct searching is a better way compared to all suggestions based on user's goal.{% endif %} You can confirm an auto-completion attempt based on the following rules: - Several auto-completion suggestions appear for the input value. @@ -15,6 +16,8 @@ Each interactable element is tagged with an ID. Reply in JSON format with the following keys: { + "thought": str, // Think step by step. Describe your thought about how you achieve the {{ "three" if is_search else "two" }} goals with convincing evidence.{% if is_search %} + "direct_searching": bool, // True if direct searching is a better way compared to all suggestions, otherwise False.{% endif %} "auto_completion_attempt": bool, // True if there's any auto completion attempt based on the rules. Otherwise, it should be False. "reasoning": str, // The reasoning behind the decision. Be specific, referencing the value and the element id in your reasoning. Mention why you chose the element id. Keep the reasoning short and to the point. "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence. @@ -25,7 +28,7 @@ Reply in JSON format with the following keys: Context: ``` -Choose an auto-completion suggestion for "{{ field_information }}" +Choose an auto-completion suggestion for "{{ field_information }}"{%if is_search %} or directly search with the input value{% endif %} ``` Input value: diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 1bd6500fdb..e973eba168 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -788,12 +788,21 @@ async def handle_input_text_action( await incremental_scraped.stop_listen_dom_increment() return [ActionSuccess()] + except Exception as e: + LOG.exception( + "Failed to input the value or finish the auto completion", + task_id=task.task_id, + step_id=step.step_id, + ) + raise e finally: # HACK: force to finish missing auto completion input - if auto_complete_hacky_flag and not await skyvern_element.is_raw_input(): + if auto_complete_hacky_flag and await skyvern_element.is_visible() and not await skyvern_element.is_raw_input(): LOG.debug( "Trigger input-selection hack, pressing Tab to choose one", action=action, + task_id=task.task_id, + step_id=step.step_id, ) await skyvern_element.press_key("Tab") @@ -1624,6 +1633,7 @@ async def choose_auto_completion_dropdown( html = incremental_scraped.build_html_tree(cleaned_incremental_element) auto_completion_confirm_prompt = prompt_engine.load_prompt( "auto-completion-choose-option", + is_search=context.is_search_bar, field_information=context.field, filled_value=text, navigation_goal=task.navigation_goal, @@ -1638,6 +1648,16 @@ async def choose_auto_completion_dropdown( json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=auto_completion_confirm_prompt, step=step) element_id = json_response.get("id", "") relevance_float = json_response.get("relevance_float", 0) + if json_response.get("direct_searching", False): + LOG.info( + "Decided to directly search with the current value", + value=text, + step_id=step.step_id, + task_id=task.task_id, + ) + await skyvern_element.press_key("Enter") + return result + if not element_id: reasoning = json_response.get("reasoning") raise NoSuitableAutoCompleteOption(reasoning=reasoning, target_value=text) @@ -1682,7 +1702,7 @@ async def choose_auto_completion_dropdown( return result finally: await incremental_scraped.stop_listen_dom_increment() - if clear_input: + if clear_input and await skyvern_element.is_visible(): await skyvern_element.input_clear() diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index 5645ba0399..2a1a5febd5 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -2132,15 +2132,26 @@ if (window.globalObserverForDOMIncrement === undefined) { } if (mutation.type === "childList") { + if (mutation.target.nodeType === Node.TEXT_NODE) continue; + const node = mutation.target; let changedNode = { - targetNode: mutation.target, // TODO: for future usage, when we want to parse new elements into a tree + targetNode: node, // TODO: for future usage, when we want to parse new elements into a tree }; let newNodes = []; - if (mutation.addedNodes && mutation.addedNodes.length > 0) { - for (const node of mutation.addedNodes) { - // skip the text nodes, they won't be interactable - if (node.nodeType === Node.TEXT_NODE) continue; - newNodes.push(node); + if ( + node.tagName.toLowerCase() === "ul" || + (node.tagName.toLowerCase() === "div" && + node.hasAttribute("role") && + node.getAttribute("role").toLowerCase() === "listbox") + ) { + newNodes.push(node); + } else { + if (mutation.addedNodes && mutation.addedNodes.length > 0) { + for (const node of mutation.addedNodes) { + // skip the text nodes, they won't be interactable + if (node.nodeType === Node.TEXT_NODE) continue; + newNodes.push(node); + } } } if (newNodes.length > 0) { diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index ce72d8b45e..6c6fe72b60 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -575,6 +575,10 @@ async def start_listen_dom_increment(self) -> None: await SkyvernFrame.evaluate(frame=self.skyvern_frame.get_frame(), expression=js_script) async def stop_listen_dom_increment(self) -> None: + # check if the DOM has navigated away or refreshed + js_script = "() => window.globalObserverForDOMIncrement === undefined" + if await SkyvernFrame.evaluate(frame=self.skyvern_frame.get_frame(), expression=js_script): + return js_script = "() => stopGlobalIncrementalObserver()" await SkyvernFrame.evaluate(frame=self.skyvern_frame.get_frame(), expression=js_script) diff --git a/skyvern/webeye/utils/dom.py b/skyvern/webeye/utils/dom.py index 84f74215ee..2dc0838c96 100644 --- a/skyvern/webeye/utils/dom.py +++ b/skyvern/webeye/utils/dom.py @@ -267,6 +267,8 @@ async def is_selectable(self) -> bool: return self.get_selectable() or self.get_tag_name() in SELECTABLE_ELEMENT async def is_visible(self) -> bool: + if not await self.get_locator().count(): + return False skyvern_frame = await SkyvernFrame.create_instance(self.get_frame()) return await skyvern_frame.get_element_visible(await self.get_element_handler())