diff --git a/README.md b/README.md index b45863c7..60cbad65 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

- UFO UFO Image: A UI-Focused Agent for Windows OS Interaction + UFO UFO Image: A UI-Focused Agent for Windows OS Interaction

@@ -35,6 +35,7 @@ Both agents leverage the multi-modal capabilities of GPT-Vision to comprehend th ## 📢 News +- 📅 2024-09-08: We have a **New Release for v1.1.0!**, which allows UFO to click on any region of the application and reduces its latency by up to 1/3! - 📅 2024-07-06: We have a **New Release for v1.0.0!**. You can check out our [documentation](https://microsoft.github.io/UFO/). We welcome your contributions and feedback! - 📅 2024-06-28: We are thrilled to announce that our official introduction video is now available on [YouTube](https://www.youtube.com/watch?v=QT_OhygMVXU)! - 📅 2024-06-25: **New Release for v0.2.1!** We are excited to announce the release of version 0.2.1! This update includes several new features and improvements: @@ -52,7 +53,7 @@ Both agents leverage the multi-modal capabilities of GPT-Vision to comprehend th 1. We now support creating your help documents for each Windows application to become an app expert. Check the [documentation](https://microsoft.github.io/UFO/creating_app_agent/help_document_provision/) for more details! 2. UFO now supports RAG from offline documents and online Bing search. 3. You can save the task completion trajectory into its memory for UFO's reference, improving its future success rate! - 4. You can customize different GPT models for AppAgent and ActAgent. Text-only models (e.g., GPT-4) are now supported! + 4. You can customize different GPT models for HostAgent and AppAgent. Text-only models (e.g., GPT-4) are now supported! - 📅 2024-02-14: Our [technical report](https://arxiv.org/abs/2402.07939) is online! - 📅 2024-02-10: UFO is released on GitHub🎈. Happy Chinese New year🐉! 
diff --git a/documents/docs/automator/ui_automator.md b/documents/docs/automator/ui_automator.md index 87a05135..625707cb 100644 --- a/documents/docs/automator/ui_automator.md +++ b/documents/docs/automator/ui_automator.md @@ -65,10 +65,15 @@ Below is the list of available commands in the UI Automator that are currently s | Command Name | Function Name | Description | |--------------|---------------|-------------| | `ClickInputCommand` | `click_input` | Click the control item with the mouse. | +| `ClickOnCoordinatesCommand` | `click_on_coordinates` | Click on the specific fractional coordinates of the application window. | +| `DragOnCoordinatesCommand` | `drag_on_coordinates` | Drag the mouse on the specific fractional coordinates of the application window. | | `SetEditTextCommand` | `set_edit_text` | Add new text to the control item. | | `GetTextsCommand` | `texts` | Get the text of the control item. | | `WheelMouseInputCommand` | `wheel_mouse_input` | Scroll the control item. | | `KeyboardInputCommand` | `keyboard_input` | Simulate the keyboard input. | +!!! tip + Please refer to the `ufo/prompts/share/base/api.yaml` file for the detailed API documentation of the UI Automator. + !!! tip You can customize the commands by adding new command classes to the `ufo/automator/ui_control/controller/ControlCommand` module. diff --git a/documents/docs/automator/web_automator.md b/documents/docs/automator/web_automator.md index 76229a00..ede352ed 100644 --- a/documents/docs/automator/web_automator.md +++ b/documents/docs/automator/web_automator.md @@ -55,4 +55,8 @@ Below is the list of available commands in the Web Automator that are currently | Command Name | Function Name | Description | |--------------|---------------|-------------| -| `WebCrawlerCommand` | `web_crawler` | Get the content of a web page into a markdown format. | \ No newline at end of file +| `WebCrawlerCommand` | `web_crawler` | Get the content of a web page into a markdown format. | + + +!!! 
tip + Please refer to the `ufo/prompts/apps/web/api.yaml` file for the prompt details for the `WebCrawlerCommand` command. \ No newline at end of file diff --git a/documents/docs/automator/wincom_automator.md b/documents/docs/automator/wincom_automator.md index 90592f11..59b14ae8 100644 --- a/documents/docs/automator/wincom_automator.md +++ b/documents/docs/automator/wincom_automator.md @@ -75,6 +75,9 @@ Below is the list of available commands in the API Automator that are currently | `InsertExcelTableCommand` | `insert_excel_table` | Insert a table to the Excel sheet. | +!!! tip + Please refer to the `ufo/prompts/apps/{app_name}/api.yaml` file for the prompt details for the commands. + !!! tip You can customize the commands by adding new command classes to the `ufo/automator/app_apis/{app_name}/` module. diff --git a/learner/indexer.py b/learner/indexer.py index 2d92b9da..4a61ff88 100644 --- a/learner/indexer.py +++ b/learner/indexer.py @@ -1,9 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +from ufo.utils import get_hugginface_embedding from . 
import xml_loader from .utils import load_json_file, save_json_file, print_with_color -from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import FAISS import os @@ -39,9 +39,7 @@ def create_indexer(app: str, docs: str, format: str, incremental: bool, save_pat ) if format == "xml": - embeddings = HuggingFaceEmbeddings( - model_name="sentence-transformers/all-mpnet-base-v2" - ) + embeddings = get_hugginface_embedding() else: raise ValueError("Invalid format: " + format) diff --git a/record_processor/summarizer/summarizer.py b/record_processor/summarizer/summarizer.py index deb212c6..8683ce6b 100644 --- a/record_processor/summarizer/summarizer.py +++ b/record_processor/summarizer/summarizer.py @@ -6,7 +6,6 @@ import yaml from langchain.docstore.document import Document -from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import FAISS from record_processor.parser.demonstration_record import DemonstrationRecord @@ -183,14 +182,11 @@ def create_or_update_vector_db(summaries: list, db_path: str): request = summary["request"] document_list.append(Document(page_content=request, metadata=summary)) - embeddings = HuggingFaceEmbeddings( - model_name="sentence-transformers/all-mpnet-base-v2" - ) - db = FAISS.from_documents(document_list, embeddings) + db = FAISS.from_documents(document_list, get_hugginface_embedding()) # Check if the db exists, if not, create a new one. 
if os.path.exists(db_path): - prev_db = FAISS.load_local(db_path, embeddings) + prev_db = FAISS.load_local(db_path, get_hugginface_embedding()) db.merge_from(prev_db) db.save_local(db_path) diff --git a/ufo/agents/agent/host_agent.py b/ufo/agents/agent/host_agent.py index 7becab2e..e4af19de 100644 --- a/ufo/agents/agent/host_agent.py +++ b/ufo/agents/agent/host_agent.py @@ -261,7 +261,7 @@ def print_response(self, response_dict: Dict) -> None: subtask = response_dict.get("CurrentSubtask") # Convert the message from a list to a string. - message = list(response_dict.get("Message")) + message = list(response_dict.get("Message", "")) message = "\n".join(message) # Concatenate the subtask with the plan and convert the plan from a list to a string. diff --git a/ufo/agents/processors/app_agent_processor.py b/ufo/agents/processors/app_agent_processor.py index a146c899..d513c9ce 100644 --- a/ufo/agents/processors/app_agent_processor.py +++ b/ufo/agents/processors/app_agent_processor.py @@ -11,10 +11,12 @@ from ufo import utils from ufo.agents.processors.basic import BaseProcessor +from ufo.automator.ui_control.screenshot import PhotographerDecorator from ufo.automator.ui_control.control_filter import ControlFilterFactory from ufo.config.config import Config from ufo.module.context import Context, ContextNames + if TYPE_CHECKING: from ufo.agents.agent.app_agent import AppAgent @@ -77,6 +79,7 @@ def print_step_info(self) -> None: "magenta", ) + @BaseProcessor.method_timer def capture_screenshot(self) -> None: """ Capture the screenshot. @@ -171,6 +174,7 @@ def capture_screenshot(self) -> None: self._save_to_xml() + @BaseProcessor.method_timer def get_control_info(self) -> None: """ Get the control information. @@ -191,6 +195,7 @@ def get_control_info(self) -> None: ) ) + @BaseProcessor.method_timer def get_prompt_message(self) -> None: """ Get the prompt message for the AppAgent. 
@@ -232,6 +237,7 @@ def get_prompt_message(self) -> None: ) self.request_logger.debug(log) + @BaseProcessor.method_timer def get_response(self) -> None: """ Get the response from the LLM. @@ -247,6 +253,7 @@ def get_response(self) -> None: self.llm_error_handler() return + @BaseProcessor.method_timer def parse_response(self) -> None: """ Parse the response. @@ -277,38 +284,59 @@ def parse_response(self) -> None: self.status = self._response_json.get("Status", "") self.app_agent.print_response(self._response_json) + @BaseProcessor.method_timer def execute_action(self) -> None: """ Execute the action. """ + control_selected = self._annotation_dict.get(self._control_label, "") + try: # Get the selected control item from the annotation dictionary and LLM response. # The LLM response is a number index corresponding to the key in the annotation dictionary. - control_selected = self._annotation_dict.get(self._control_label, "") if control_selected: - control_selected.draw_outline(colour="red", thickness=3) - time.sleep(configs.get("RECTANGLE_TIME", 0)) - self.app_agent.Puppeteer.receiver_manager.create_ui_control_receiver( - control_selected, self.application_window - ) + if configs.get("SHOW_VISUAL_OUTLINE_ON_SCREEN", True): + control_selected.draw_outline(colour="red", thickness=3) + time.sleep(configs.get("RECTANGLE_TIME", 0)) - # Save the screenshot of the tagged selected control. 
- self.capture_control_screenshot(control_selected) + control_coordinates = PhotographerDecorator.coordinate_adjusted( + self.application_window.rectangle(), control_selected.rectangle() + ) - if self.status.upper() == self._agent_status_manager.SCREENSHOT.value: - self.handle_screenshot_status() - else: - self._results = self.app_agent.Puppeteer.execute_command( - self._operation, self._args + self._control_log = { + "control_class": control_selected.element_info.class_name, + "control_type": control_selected.element_info.control_type, + "control_automation_id": control_selected.element_info.automation_id, + "control_friendly_class_name": control_selected.friendly_class_name(), + "control_coordinates": { + "left": control_coordinates[0], + "top": control_coordinates[1], + "right": control_coordinates[2], + "bottom": control_coordinates[3], + }, + } + + self.app_agent.Puppeteer.receiver_manager.create_ui_control_receiver( + control_selected, self.application_window ) - self.control_reannotate = None - if not utils.is_json_serializable(self._results): - self._results = "" - return + # Save the screenshot of the tagged selected control. 
+ self.capture_control_screenshot(control_selected) + + if self.status.upper() == self._agent_status_manager.SCREENSHOT.value: + self.handle_screenshot_status() + else: + self._results = self.app_agent.Puppeteer.execute_command( + self._operation, self._args + ) + self.control_reannotate = None + if not utils.is_json_serializable(self._results): + self._results = "" + + return except Exception: self.general_error_handler() @@ -365,7 +393,7 @@ def update_memory(self) -> None: "Action": self.action, "ActionType": self.app_agent.Puppeteer.get_command_types(self._operation), "Request": self.request, - "Agent": "ActAgent", + "Agent": "AppAgent", "AgentName": self.app_agent.name, "Application": app_root, "Cost": self._cost, @@ -373,6 +401,8 @@ def update_memory(self) -> None: } self._memory_data.set_values_from_dict(self._response_json) self._memory_data.set_values_from_dict(additional_memory) + self._memory_data.set_values_from_dict(self._control_log) + self._memory_data.set_values_from_dict({"time_cost": self._time_cost}) if self.status.upper() == self._agent_status_manager.CONFIRM.value: self._memory_data.set_values_from_dict({"UserConfirm": "Yes"}) @@ -381,7 +411,7 @@ def update_memory(self) -> None: # Log the memory item. self.context.add_to_structural_logs(self._memory_data.to_dict()) - self.log(self._memory_data.to_dict()) + # self.log(self._memory_data.to_dict()) # Only memorize the keys in the HISTORY_KEYS list to feed into the prompt message in the future steps. memorized_action = { diff --git a/ufo/agents/processors/basic.py b/ufo/agents/processors/basic.py index da9800db..543e1821 100644 --- a/ufo/agents/processors/basic.py +++ b/ufo/agents/processors/basic.py @@ -2,11 +2,12 @@ # Licensed under the MIT License. 
+from functools import wraps import json import time import traceback from abc import ABC, abstractmethod -from typing import List +from typing import Any, List from pywinauto.controls.uiawrapper import UIAWrapper @@ -57,6 +58,15 @@ def __init__(self, agent: BasicAgent, context: Context) -> None: self._action = None self._plan = None + self._control_log = { + "control_class": None, + "control_type": None, + "control_automation_id": None, + } + + self._total_time_cost = 0 + self._time_cost = {} + def process(self) -> None: """ Process a single step in a round. @@ -71,9 +81,11 @@ def process(self) -> None: 8. Execute the action. 9. Update the memory. 10. Update the step and status. - 11. Update the step. + 11. Save the log. """ + start_time = time.time() + # Step 1: Print the step information. self.print_step_info() @@ -115,8 +127,10 @@ def process(self) -> None: # Step 10: Update the status. self.update_status() - # Step 11: Update the context. - self.update_step() + self._total_time_cost = time.time() - start_time + + # Step 11: Save the log. + self.log_save() def resume(self) -> None: """ @@ -139,6 +153,24 @@ def resume(self) -> None: self._is_resumed = False + @classmethod + def method_timer(cls, func): + """ + Decorator to calculate the time cost of the method. + :param func: The method to be decorated. + :return: The decorated method. + """ + + @wraps(func) + def wrapper(self, *args, **kwargs): + start_time = time.time() + result = func(self, *args, **kwargs) + end_time = time.time() + self._time_cost[func.__name__] = end_time - start_time + return result + + return wrapper + @abstractmethod def print_step_info(self) -> None: """ @@ -205,6 +237,19 @@ def update_status(self) -> None: if self.status != self._agent_status_manager.FINISH.value: time.sleep(configs["SLEEP_TIME"]) + self.round_step += 1 + self.session_step += 1 + + def log_save(self) -> None: + """ + Save the log. 
+ """ + + self._memory_data.set_values_from_dict( + {"total_time_cost": self._total_time_cost} + ) + self.log(self._memory_data.to_dict()) + @property def context(self) -> Context: """ @@ -221,14 +266,6 @@ def update_cost(self) -> None: self.round_cost += self.cost self.session_cost += self.cost - def update_step(self) -> None: - """ - Update the step. - """ - - self.round_step += 1 - self.session_step += 1 - @property def agent(self) -> BasicAgent: """ @@ -684,7 +721,7 @@ def llm_error_handler(self) -> None: return @staticmethod - def string2list(string: str) -> List[str]: + def string2list(string: Any) -> List[str]: """ Convert a string to a list of string if the input is a string. :param string: The string. diff --git a/ufo/agents/processors/host_agent_processor.py b/ufo/agents/processors/host_agent_processor.py index c2d06149..ab4257eb 100644 --- a/ufo/agents/processors/host_agent_processor.py +++ b/ufo/agents/processors/host_agent_processor.py @@ -51,6 +51,7 @@ def print_step_info(self) -> None: "magenta", ) + @BaseProcessor.method_timer def capture_screenshot(self) -> None: """ Capture the screenshot. @@ -70,6 +71,7 @@ def capture_screenshot(self) -> None: desktop_save_path ) + @BaseProcessor.method_timer def get_control_info(self) -> None: """ Get the control information. @@ -85,6 +87,7 @@ def get_control_info(self) -> None: self._desktop_windows_dict ) + @BaseProcessor.method_timer def get_prompt_message(self) -> None: """ Get the prompt message. @@ -111,6 +114,7 @@ def get_prompt_message(self) -> None: ) self.request_logger.debug(log) + @BaseProcessor.method_timer def get_response(self) -> None: """ Get the response from the LLM. @@ -125,6 +129,7 @@ def get_response(self) -> None: except Exception: self.llm_error_handler() + @BaseProcessor.method_timer def parse_response(self) -> None: """ Parse the response. 
@@ -153,6 +158,7 @@ def parse_response(self) -> None: self.host_agent.print_response(self._response_json) + @BaseProcessor.method_timer def execute_action(self) -> None: """ Execute the action. @@ -171,6 +177,12 @@ def execute_action(self) -> None: self.status = self._agent_status_manager.FINISH.value return + self._control_log = { + "control_class": new_app_window.element_info.class_name, + "control_type": new_app_window.element_info.control_type, + "control_automation_id": new_app_window.element_info.automation_id, + } + # Get the root name of the application. self.app_root = self.control_inspector.get_application_root_name(new_app_window) @@ -183,7 +195,8 @@ def execute_action(self) -> None: # Switch to the new application window, if it is different from the current application window. self.switch_to_new_app_window(new_app_window) self.application_window.set_focus() - self.application_window.draw_outline(colour="red", thickness=3) + if configs.get("SHOW_VISUAL_OUTLINE_ON_SCREEN", True): + self.application_window.draw_outline(colour="red", thickness=3) self.action = "set_focus()" @@ -235,7 +248,7 @@ def update_memory(self) -> None: "RoundStep": self.round_step, "AgentStep": self.host_agent.step, "Round": self.round_num, - "ControlLabel": self.control_text, + "ControlLabel": self.control_label, "SubtaskIndex": -1, "Action": self.action, "ActionType": "UIControl", @@ -249,11 +262,14 @@ def update_memory(self) -> None: self._memory_data.set_values_from_dict(self._response_json) self._memory_data.set_values_from_dict(additional_memory) + self._memory_data.set_values_from_dict(self._control_log) + self._memory_data.set_values_from_dict({"time_cost": self._time_cost}) + self.host_agent.add_memory(self._memory_data) # Log the memory item. 
self.context.add_to_structural_logs(self._memory_data.to_dict()) - self.log(self._memory_data.to_dict()) + # self.log(self._memory_data.to_dict()) # Only memorize the keys in the HISTORY_KEYS list to feed into the prompt message in the future steps. memorized_action = { diff --git a/ufo/automator/app_apis/excel/excelclient.py b/ufo/automator/app_apis/excel/excelclient.py index 476b1cec..386daf4c 100644 --- a/ufo/automator/app_apis/excel/excelclient.py +++ b/ufo/automator/app_apis/excel/excelclient.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from typing import Any, Dict, List, Type +from typing import Any, Dict, List, Type, Union import pandas as pd @@ -30,10 +30,10 @@ def get_object_from_process_name(self) -> None: return None - def table2markdown(self, sheet_name: str) -> str: + def table2markdown(self, sheet_name: Union[str, int]) -> str: """ Convert the table in the sheet to a markdown table string. - :param sheet_name: The sheet name. + :param sheet_name: The sheet name (str), or the sheet index (int), starting from 1. :return: The markdown table string. """ diff --git a/ufo/automator/app_apis/web/webclient.py b/ufo/automator/app_apis/web/webclient.py index 9a40333f..d6bb8571 100644 --- a/ufo/automator/app_apis/web/webclient.py +++ b/ufo/automator/app_apis/web/webclient.py @@ -51,7 +51,7 @@ def web_crawler(self, url: str, ignore_link: bool) -> str: except requests.RequestException as e: print(f"Error fetching the URL: {e}") - return f"Error fetching the URL: {e}" + return f"Error fetching the URL: {e}" @property def type_name(self): diff --git a/ufo/automator/puppeteer.py b/ufo/automator/puppeteer.py index d3163924..1486e92c 100644 --- a/ufo/automator/puppeteer.py +++ b/ufo/automator/puppeteer.py @@ -56,9 +56,14 @@ def get_command_types(self, command_name: str) -> str: :param command_name: The command name. :return: The command types. 
""" - receiver = self.receiver_manager.get_receiver_from_command_name(command_name) - return receiver.type_name + try: + receiver = self.receiver_manager.get_receiver_from_command_name( + command_name + ) + return receiver.type_name + except: + return "" def execute_command( self, command_name: str, params: Dict[str, Any], *args, **kwargs diff --git a/ufo/automator/ui_control/controller.py b/ufo/automator/ui_control/controller.py index 5a0223ea..eae14c98 100644 --- a/ufo/automator/ui_control/controller.py +++ b/ufo/automator/ui_control/controller.py @@ -4,9 +4,12 @@ import time import warnings from abc import abstractmethod -from typing import Any, Dict, List, Optional, Type, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union +import pyautogui +import pywinauto from pywinauto.controls.uiawrapper import UIAWrapper +from pywinauto.win32structures import RECT from ufo.automator.basic import CommandBasic, ReceiverBasic, ReceiverFactory from ufo.automator.puppeteer import ReceiverManager @@ -15,6 +18,10 @@ configs = Config.get_instance().config_data +if configs.get("AFTER_CLICK_WAIT", None) is not None: + pywinauto.timings.Timings.after_clickinput_wait = configs["AFTER_CLICK_WAIT"] + pywinauto.timings.Timings.after_click_wait = configs["AFTER_CLICK_WAIT"] + class ControlReceiver(ReceiverBasic): """ @@ -23,7 +30,9 @@ class ControlReceiver(ReceiverBasic): _command_registry: Dict[str, Type[CommandBasic]] = {} - def __init__(self, control: UIAWrapper, application: UIAWrapper): + def __init__( + self, control: Optional[UIAWrapper], application: Optional[UIAWrapper] + ) -> None: """ Initialize the control receiver. :param control: The control element. 
@@ -31,11 +40,13 @@ def __init__(self, control: UIAWrapper, application: UIAWrapper): """ self.control = control + self.application = application if control: self.control.set_focus() self.wait_enabled() - self.application = application + elif application: + self.application.set_focus() @property def type_name(self): @@ -49,6 +60,8 @@ def atomic_execution(self, method_name: str, params: Dict[str, Any]) -> str: :return: The result of the action. """ + import traceback + try: method = getattr(self.control, method_name) result = method(**params) @@ -57,7 +70,8 @@ def atomic_execution(self, method_name: str, params: Dict[str, Any]) -> str: print_with_color(f"Warning: {message}", "yellow") result = message except Exception as e: - message = f"An error occurred: {e}" + full_traceback = traceback.format_exc() + message = f"An error occurred: {full_traceback}" print_with_color(f"Warning: {message}", "yellow") result = message return result @@ -76,6 +90,50 @@ def click_input(self, params: Dict[str, Union[str, bool]]) -> str: else: return self.atomic_execution("click_input", params) + def click_on_coordinates(self, params: Dict[str, str]) -> str: + """ + Click on the coordinates of the control element. + :param params: The arguments of the click on coordinates method. + :return: The result of the click on coordinates action. + """ + + # Get the relative coordinates fraction of the application window. + x = float(params.get("x", 0)) + y = float(params.get("y", 0)) + + button = params.get("button", "left") + double = params.get("double", False) + + # Get the absolute coordinates of the application window. + tranformed_x, tranformed_y = self.transform_point(x, y) + + pyautogui.click( + tranformed_x, tranformed_y, button=button, clicks=2 if double else 1 + ) + + return "" + + def drag_on_coordinates(self, params: Dict[str, str]) -> str: + """ + Drag on the coordinates of the control element. + :param params: The arguments of the drag on coordinates method. 
+ :return: The result of the drag on coordinates action. + """ + + start = self.transform_point( + float(params.get("start_x", 0)), float(params.get("start_y", 0)) + ) + end = self.transform_point( + float(params.get("end_x", 0)), float(params.get("end_y", 0)) + ) + + button = params.get("button", "left") + + pyautogui.moveTo(start[0], start[1]) + pyautogui.dragTo(end[0], end[1], button=button) + + return "" + def summary(self, params: Dict[str, str]) -> str: """ Visual summary of the control element. @@ -93,6 +151,7 @@ def set_edit_text(self, params: Dict[str, str]) -> str: """ text = params.get("text", "") + inter_key_pause = configs.get("INPUT_TEXT_INTER_KEY_PAUSE", 0.1) if configs["INPUT_TEXT_API"] == "set_text": method_name = "set_edit_text" @@ -102,7 +161,7 @@ def set_edit_text(self, params: Dict[str, str]) -> str: text = text.replace("\n", "{ENTER}") text = text.replace("\t", "{TAB}") - args = {"keys": text, "pause": 0.1, "with_spaces": True} + args = {"keys": text, "pause": inter_key_pause, "with_spaces": True} try: result = self.atomic_execution(method_name, args) if ( @@ -125,7 +184,11 @@ def set_edit_text(self, params: Dict[str, str]) -> str: text_to_type = args["text"] keys_to_send = clear_text_keys + text_to_type method_name = "type_keys" - args = {"keys": keys_to_send, "pause": 0.1, "with_spaces": True} + args = { + "keys": keys_to_send, + "pause": inter_key_pause, + "with_spaces": True, + } return self.atomic_execution(method_name, args) else: return f"An error occurred: {e}" @@ -136,7 +199,15 @@ def keyboard_input(self, params: Dict[str, str]) -> str: :param params: The arguments of the keyboard input method. :return: The result of the keyboard input action. 
""" - return self.atomic_execution("type_keys", params) + + control_focus = params.get("control_focus", True) + keys = params.get("keys", "") + + if control_focus: + self.atomic_execution("type_keys", {"keys": keys}) + else: + pyautogui.typewrite(keys) + return keys def texts(self) -> str: """ @@ -203,6 +274,24 @@ def wait_visible(self, timeout: int = 10, retry_interval: int = 0.5) -> None: warnings.warn(f"Timeout: {self.control} is not visible.") break + def transform_point(self, fraction_x: float, fraction_y: float) -> Tuple[int, int]: + """ + Transform the relative coordinates to the absolute coordinates. + :param fraction_x: The relative x coordinate. + :param fraction_y: The relative y coordinate. + :return: The absolute coordinates. + """ + application_rect: RECT = self.application.rectangle() + application_x = application_rect.left + application_y = application_rect.top + application_width = application_rect.width() + application_height = application_rect.height() + + x = application_x + int(application_width * fraction_x) + y = application_y + int(application_height * fraction_y) + + return x, y + @ReceiverManager.register class UIControlReceiverFactory(ReceiverFactory): @@ -315,6 +404,50 @@ def name(cls) -> str: return "click_input" +@ControlReceiver.register +class ClickOnCoordinatesCommand(ControlCommand): + """ + The click on coordinates command class. + """ + + def execute(self) -> str: + """ + Execute the click on coordinates command. + :return: The result of the click on coordinates command. + """ + return self.receiver.click_on_coordinates(self.params) + + @classmethod + def name(cls) -> str: + """ + Get the name of the atomic command. + :return: The name of the atomic command. + """ + return "click_on_coordinates" + + +@ControlReceiver.register +class DragOnCoordinatesCommand(ControlCommand): + """ + The drag on coordinates command class. + """ + + def execute(self) -> str: + """ + Execute the drag on coordinates command. 
+ :return: The result of the drag on coordinates command. + """ + return self.receiver.drag_on_coordinates(self.params) + + @classmethod + def name(cls) -> str: + """ + Get the name of the atomic command. + :return: The name of the atomic command. + """ + return "drag_on_coordinates" + + @ControlReceiver.register class SummaryCommand(ControlCommand): """ diff --git a/ufo/automator/ui_control/inspector.py b/ufo/automator/ui_control/inspector.py index d56cb9fa..d8649c56 100644 --- a/ufo/automator/ui_control/inspector.py +++ b/ufo/automator/ui_control/inspector.py @@ -3,16 +3,22 @@ from __future__ import annotations +import functools +import time from abc import ABC, abstractmethod -from typing import Dict, List +from typing import Callable, Dict, List, Optional, cast +import comtypes.gen.UIAutomationClient as UIAutomationClient_dll import psutil +import pywinauto +import pywinauto.uia_defines +import uiautomation as auto from pywinauto import Desktop from pywinauto.controls.uiawrapper import UIAWrapper +from pywinauto.uia_element_info import UIAElementInfo from ufo.config.config import Config - configs = Config.get_instance().config_data @@ -76,6 +82,79 @@ def find_control_elements_in_descendants( pass +class UIAElementInfoFix(UIAElementInfo): + _cached_rect = None + _time_delay_marker = False + + def sleep(self, ms: float = 0): + import time + + if UIAElementInfoFix._time_delay_marker: + ms = max(20, ms) + else: + ms = max(1, ms) + time.sleep(ms / 1000.0) + UIAElementInfoFix._time_delay_marker = False + + @staticmethod + def _time_wrap(func): + def dec(self, *args, **kvargs): + name = func.__name__ + before = time.time() + result = func(self, *args, **kvargs) + if time.time() - before > 0.020: + print( + f"[❌][{name}][{hash(self._element)}] lookup took {(time.time() - before) * 1000:.2f} ms" + ) + UIAElementInfoFix._time_delay_marker = True + elif time.time() - before > 0.005: + print( + f"[⚠️][{name}][{hash(self._element)}]Control type lookup took {(time.time() - 
before) * 1000:.2f} ms" + ) + UIAElementInfoFix._time_delay_marker = True + else: + # print(f"[βœ…][{name}][{hash(self._element)}]Control type lookup took {(time.time() - before) * 1000:.2f} ms") + UIAElementInfoFix._time_delay_marker = False + return result + + return dec + + @_time_wrap + def _get_current_name(self): + return super()._get_current_name() + + @_time_wrap + def _get_current_rich_text(self): + return super()._get_current_rich_text() + + @_time_wrap + def _get_current_class_name(self): + return super()._get_current_class_name() + + @_time_wrap + def _get_current_control_type(self): + return super()._get_current_control_type() + + @_time_wrap + def _get_current_rectangle(self): + bound_rect = self._element.CurrentBoundingRectangle + rect = pywinauto.win32structures.RECT() + rect.left = bound_rect.left + rect.top = bound_rect.top + rect.right = bound_rect.right + rect.bottom = bound_rect.bottom + return rect + + def _get_cached_rectangle(self) -> tuple[int, int, int, int]: + if self._cached_rect is None: + self._cached_rect = self._get_current_rectangle() + return self._cached_rect + + @property + def rectangle(self): + return self._get_cached_rectangle() + + class UIABackendStrategy(BackendStrategy): """ The backend strategy for UIA. @@ -87,7 +166,13 @@ def get_desktop_windows(self, remove_empty: bool) -> List[UIAWrapper]: :param remove_empty: Whether to remove empty titles. :return: The apps on the desktop. 
""" - desktop_windows = Desktop(backend="uia").windows() + + # UIA Com API would incur severe performance occasionally (such as a new app just started) + # so we use Win32 to acquire the handle and then convert it to UIA interface + + desktop_windows = Desktop(backend="win32").windows() + desktop_windows = [app for app in desktop_windows if app.is_visible()] + if remove_empty: desktop_windows = [ app @@ -95,11 +180,16 @@ def get_desktop_windows(self, remove_empty: bool) -> List[UIAWrapper]: if app.window_text() != "" and app.element_info.class_name not in ["IME", "MSCTFIME UI"] ] - return desktop_windows + + uia_desktop_windows: List[UIAWrapper] = [ + UIAWrapper(UIAElementInfo(handle_or_elem=window.handle)) + for window in desktop_windows + ] + return uia_desktop_windows def find_control_elements_in_descendants( self, - window: UIAWrapper, + window: Optional[UIAWrapper], control_type_list: List[str] = [], class_name_list: List[str] = [], title_list: List[str] = [], @@ -119,45 +209,155 @@ def find_control_elements_in_descendants( :return: The control elements found. 
""" - if window == None: + if window is None: return [] - control_elements = [] - if len(control_type_list) == 0: - control_elements += window.descendants() - else: - for control_type in control_type_list: - if depth == 0: - subcontrols = window.descendants(control_type=control_type) - else: - subcontrols = window.descendants( - control_type=control_type, depth=depth - ) - control_elements += subcontrols + assert ( + class_name_list is None or len(class_name_list) == 0 + ), "class_name_list is not supported for UIA backend" - if is_visible: - control_elements = [ - control for control in control_elements if control.is_visible() - ] - if is_enabled: - control_elements = [ - control for control in control_elements if control.is_enabled() - ] - if len(title_list) > 0: - control_elements = [ - control - for control in control_elements - if control.window_text() in title_list - ] - if len(class_name_list) > 0: - control_elements = [ - control - for control in control_elements - if control.element_info.class_name in class_name_list - ] + _, iuia_dll = UIABackendStrategy._get_uia_defs() + window_elem_info = cast(UIAElementInfo, window.element_info) + window_elem_com_ref = cast( + UIAutomationClient_dll.IUIAutomationElement, window_elem_info._element + ) + + condition = UIABackendStrategy._get_control_filter_condition( + control_type_list, + is_visible, + is_enabled, + ) + + cache_request = UIABackendStrategy._get_cache_request() + + com_elem_array = window_elem_com_ref.FindAllBuildCache( + scope=iuia_dll.TreeScope_Descendants, + condition=condition, + cacheRequest=cache_request, + ) + + elem_info_list = [ + ( + elem, + elem.CachedControlType, + elem.CachedName, + elem.CachedBoundingRectangle, + ) + for elem in ( + com_elem_array.GetElement(n) + for n in range(min(com_elem_array.Length, 300)) + ) + ] + + control_elements: List[UIAWrapper] = [] + + for elem, elem_type, elem_name, elem_rect in elem_info_list: + element_info = UIAElementInfoFix(elem, True) + elem_type_name = 
UIABackendStrategy._get_uia_control_name_map().get( + elem_type, "" + ) + + # handle is not needed, skip fetching + element_info._cached_handle = 0 + + # visibility is determined by filter condition + element_info._cached_visible = True + + # fill the values with pre-fetched data + rect = pywinauto.win32structures.RECT() + rect.left = elem_rect.left + rect.top = elem_rect.top + rect.right = elem_rect.right + rect.bottom = elem_rect.bottom + element_info._cached_rect = rect + element_info._cached_name = elem_name + element_info._cached_control_type = elem_type_name + + # currently rich text is not used, skip fetching but use name as alternative + # this could be reverted if some control requires rich text + element_info._cached_rich_text = elem_name + + # class name is not used directly, could pre-fetch in future + # element_info.class_name + + uia_interface = UIAWrapper(element_info) + + def __hash__(self): + return hash(self.element_info._element) + + # current __hash__ is not referring to a COM property (RuntimeId), which is costly to fetch + uia_interface.__hash__ = __hash__ + + control_elements.append(uia_interface) return control_elements + @staticmethod + def _get_uia_control_id_map(): + iuia = pywinauto.uia_defines.IUIA() + return iuia.known_control_types + + @staticmethod + def _get_uia_control_name_map(): + iuia = pywinauto.uia_defines.IUIA() + return iuia.known_control_type_ids + + @staticmethod + @functools.lru_cache() + def _get_cache_request(): + iuia_com, iuia_dll = UIABackendStrategy._get_uia_defs() + cache_request = iuia_com.CreateCacheRequest() + cache_request.AddProperty(iuia_dll.UIA_ControlTypePropertyId) + cache_request.AddProperty(iuia_dll.UIA_NamePropertyId) + cache_request.AddProperty(iuia_dll.UIA_BoundingRectanglePropertyId) + return cache_request + + @staticmethod + def _get_control_filter_condition( + control_type_list: List[str] = [], + is_visible: bool = True, + is_enabled: bool = True, + ): + iuia_com, iuia_dll = 
UIABackendStrategy._get_uia_defs() + condition = iuia_com.CreateAndConditionFromArray( + [ + iuia_com.CreatePropertyCondition( + iuia_dll.UIA_IsEnabledPropertyId, is_enabled + ), + iuia_com.CreatePropertyCondition( + # visibility is determined by IsOffscreen property + iuia_dll.UIA_IsOffscreenPropertyId, + not is_visible, + ), + iuia_com.CreatePropertyCondition( + iuia_dll.UIA_IsControlElementPropertyId, True + ), + iuia_com.CreateOrConditionFromArray( + [ + iuia_com.CreatePropertyCondition( + iuia_dll.UIA_ControlTypePropertyId, + ( + control_type + if control_type is int + else UIABackendStrategy._get_uia_control_id_map()[ + control_type + ] + ), + ) + for control_type in control_type_list + ] + ), + ] + ) + return condition + + @staticmethod + def _get_uia_defs(): + iuia = pywinauto.uia_defines.IUIA() + iuia_com: UIAutomationClient_dll.IUIAutomation = iuia.iuia + iuia_dll: UIAutomationClient_dll = iuia.UIA_dll + return iuia_com, iuia_dll + class Win32BackendStrategy(BackendStrategy): """ @@ -369,6 +569,39 @@ def get_control_info_list_of_dict( control_info_list.append(control_info) return control_info_list + @staticmethod + def get_check_state(control_item: auto.Control) -> bool | None: + """ + get the check state of the control item + param control_item: the control item to get the check state + return: the check state of the control item + """ + is_checked = None + is_selected = None + try: + assert isinstance( + control_item, auto.Control + ), f"{control_item =} is not a Control" + is_checked = ( + control_item.GetLegacyIAccessiblePattern().State + & auto.AccessibleState.Checked + == auto.AccessibleState.Checked + ) + if is_checked: + return is_checked + is_selected = ( + control_item.GetLegacyIAccessiblePattern().State + & auto.AccessibleState.Selected + == auto.AccessibleState.Selected + ) + if is_selected: + return is_selected + return None + except Exception as e: + # print(f'item {control_item} not available for check state.') + # print(e) + return None + 
@staticmethod def get_control_info( window: UIAWrapper, field_list: List[str] = [] @@ -379,22 +612,26 @@ def get_control_info( :param field_list: The fields to get. return: The control info of the window. """ - control_info = {} + control_info: Dict[str, str] = {} + + def assign(prop_name: str, prop_value_func: Callable[[], str]) -> None: + if len(field_list) > 0 and prop_name not in field_list: + return + control_info[prop_name] = prop_value_func() + try: - control_info["control_type"] = window.element_info.control_type - control_info["control_id"] = window.element_info.control_id - control_info["control_class"] = window.element_info.class_name - control_info["control_name"] = window.element_info.name - control_info["control_rect"] = window.element_info.rectangle - control_info["control_text"] = window.element_info.name - control_info["control_title"] = window.window_text() + assign("control_type", lambda: window.element_info.control_type) + assign("control_id", lambda: window.element_info.control_id) + assign("control_class", lambda: window.element_info.class_name) + assign("control_name", lambda: window.element_info.name) + assign("control_rect", lambda: window.element_info.rectangle) + assign("control_text", lambda: window.element_info.name) + assign("control_title", lambda: window.window_text()) + assign("selected", lambda: ControlInspectorFacade.get_check_state(window)) + return control_info except: return {} - if len(field_list) > 0: - control_info = {field: control_info[field] for field in field_list} - return control_info - @staticmethod def get_application_root_name(window: UIAWrapper) -> str: """ diff --git a/ufo/automator/ui_control/screenshot.py b/ufo/automator/ui_control/screenshot.py index 6b411032..5f992ed5 100644 --- a/ufo/automator/ui_control/screenshot.py +++ b/ufo/automator/ui_control/screenshot.py @@ -2,11 +2,12 @@ # Licensed under the MIT License. 
import base64 +import functools import mimetypes import os from abc import ABC, abstractmethod from io import BytesIO -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple from PIL import Image, ImageDraw, ImageFont, ImageGrab from pywinauto.controls.uiawrapper import UIAWrapper @@ -16,6 +17,8 @@ configs = Config.get_instance().config_data +DEFAULT_PNG_COMPRESS_LEVEL = int(configs["DEFAULT_PNG_COMPRESS_LEVEL"]) + class Photographer(ABC): """ @@ -47,7 +50,7 @@ def capture(self, save_path: str = None): # Capture single window screenshot screenshot = self.control.capture_as_image() if save_path is not None: - screenshot.save(save_path) + screenshot.save(save_path, compress_level=DEFAULT_PNG_COMPRESS_LEVEL) return screenshot @@ -71,7 +74,7 @@ def capture(self, save_path: str = None): """ screenshot = ImageGrab.grab(all_screens=self.all_screens) if save_path is not None: - screenshot.save(save_path) + screenshot.save(save_path, compress_level=DEFAULT_PNG_COMPRESS_LEVEL) return screenshot @@ -96,12 +99,12 @@ def capture(self, save_path=None): return self.photographer.capture(save_path) @staticmethod - def coordinate_adjusted(window_rect: RECT, control_rect: RECT): + def coordinate_adjusted(window_rect: RECT, control_rect: RECT) -> Tuple: """ Adjust the coordinates of the control rectangle to the window rectangle. :param window_rect: The window rectangle. :param control_rect: The control rectangle. - :return: The adjusted control rectangle. + :return: The adjusted control rectangle (left, top, right, bottom), relative to the window rectangle. 
""" # (left, top, right, bottom) adjusted_rect = ( @@ -172,7 +175,7 @@ def capture(self, save_path: str): screenshot, coordinate=adjusted_rect, color=self.color ) if save_path is not None: - screenshot.save(save_path) + screenshot.save(save_path, compress_level=DEFAULT_PNG_COMPRESS_LEVEL) return screenshot @@ -228,8 +231,31 @@ def draw_rectangles_controls( :param button_color: The color of the button. return: The image with the control rectangle and label. """ - _ = ImageDraw.Draw(image) - font = ImageFont.truetype("arial.ttf", font_size) + button_img = AnnotationDecorator._get_button_img( + label_text, + botton_margin=botton_margin, + border_width=border_width, + font_size=font_size, + font_color=font_color, + border_color=border_color, + button_color=button_color, + ) + # put button on source image + image.paste(button_img, (coordinate[0], coordinate[1])) + return image + + @staticmethod + @functools.lru_cache(maxsize=2048, typed=False) + def _get_button_img( + label_text: str, + botton_margin: int = 5, + border_width: int = 2, + font_size: int = 25, + font_color: str = "#000000", + border_color: str = "#FF0000", + button_color: str = "#FFF68F", + ): + font = AnnotationDecorator._get_font("arial.ttf", font_size) text_size = font.getbbox(label_text) # set button size + margins @@ -250,10 +276,12 @@ def draw_rectangles_controls( outline=border_color, width=border_width, ) + return button_img - # put button on source image - image.paste(button_img, (coordinate[0], coordinate[1])) - return image + @staticmethod + @functools.lru_cache(maxsize=64, typed=False) + def _get_font(name: str, size: int): + return ImageFont.truetype(name, size) @staticmethod def number_to_letter(n: int): @@ -335,7 +363,9 @@ def capture_with_annotation_dict( ) if save_path is not None: - screenshot_annotated.save(save_path) + screenshot_annotated.save( + save_path, compress_level=DEFAULT_PNG_COMPRESS_LEVEL + ) return screenshot_annotated @@ -537,7 +567,7 @@ def concat_screenshots( 
result.paste(image2, (image1.width, 0)) # Save the result - result.save(output_path) + result.save(output_path, compress_level=DEFAULT_PNG_COMPRESS_LEVEL) return result @@ -550,7 +580,7 @@ def image_to_base64(image: Image.Image) -> str: :return: The base64 string. """ buffered = BytesIO() - image.save(buffered, format="PNG") + image.save(buffered, format="PNG", optimize=True) return base64.b64encode(buffered.getvalue()).decode("utf-8") @staticmethod diff --git a/ufo/config/config_dev.yaml b/ufo/config/config_dev.yaml index c3c623ed..d62e0d25 100644 --- a/ufo/config/config_dev.yaml +++ b/ufo/config/config_dev.yaml @@ -1,10 +1,13 @@ CONTROL_BACKEND: "uia" # The backend for control action, currently we support uia and win32 MAX_STEP: 100 # The max step limit for completing the user request -SLEEP_TIME: 5 # The sleep time between each step to wait for the window to be ready +SLEEP_TIME: 3 # The sleep time between each step to wait for the window to be ready RECTANGLE_TIME: 1 +# Skip rendering visual outline on screen if not necessary +SHOW_VISUAL_OUTLINE_ON_SCREEN: False + SAFE_GUARD: True # Whether to use the safe guard to prevent the model from doing sensitve operations. -CONTROL_LIST: ["Button", "Edit", "TabItem", "Document", "ListItem", "MenuItem", "ScrollBar", "TreeItem", "Hyperlink", "ComboBox", "RadioButton", "DataItem"] +CONTROL_LIST: ["Button", "Edit", "TabItem", "Document", "ListItem", "MenuItem", "ScrollBar", "TreeItem", "Hyperlink", "ComboBox", "RadioButton", "DataItem", "Spinner"] # The list of widgets that allowed to be selected, in uia backend, it will be used for filter the control_type, while in win32 backend, it will be used for filter the class_name. HISTORY_KEYS: ["Step", "Thought", "ControlText", "Subtask", "Action", "Comment", "Results", "UserConfirm"] # The keys of the action history for the next step. 
ANNOTATION_COLORS: { @@ -49,8 +52,10 @@ DEMONSTRATION_SAVED_PATH: "vectordb/demonstration/" API_PROMPT: "ufo/prompts/share/base/api.yaml" # The prompt for the API CLICK_API: "click_input" # The click API +AFTER_CLICK_WAIT: 0 # The wait time after clicking in seconds, even if the value is 0, there will be a small pause. Default 0.09 in pywinauto. INPUT_TEXT_API: "type_keys" # The input text API. Can be "type_keys" or "set_text" INPUT_TEXT_ENTER: False # whether to press enter after typing the text +INPUT_TEXT_INTER_KEY_PAUSE: 0.05 # The pause time between each key press in seconds. Even if the value is 0, there will be a small pause. Default 0.05 in pywinauto ## APIs related @@ -88,3 +93,6 @@ QA_PAIR_NUM: 20 # The number of QA pairs for the customization EVA_SESSION: True # Whether to include the session in the evaluation EVA_ROUND: FALSE EVA_ALL_SCREENSHOTS: True # Whether to include all the screenshots in the evaluation + +# Image saving performance +DEFAULT_PNG_COMPRESS_LEVEL: 1 # The compress level for the PNG image, 0-9, 0 is no compress, 1 is the fastest, 9 is the best compress diff --git a/ufo/experience/summarizer.py b/ufo/experience/summarizer.py index be8652a6..dd000e43 100644 --- a/ufo/experience/summarizer.py +++ b/ufo/experience/summarizer.py @@ -6,13 +6,12 @@ import yaml from langchain.docstore.document import Document -from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import FAISS from ufo.experience.parser import ExperienceLogLoader from ufo.llm.llm_call import get_completion from ufo.prompter.experience_prompter import ExperiencePrompter -from ufo.utils import json_parser +from ufo.utils import get_hugginface_embedding, json_parser class ExperienceSummarizer: @@ -175,14 +174,11 @@ def create_or_update_vector_db(summaries: list, db_path: str): request = summary["request"] document_list.append(Document(page_content=request, metadata=summary)) - embeddings = HuggingFaceEmbeddings( - 
model_name="sentence-transformers/all-mpnet-base-v2" - ) - db = FAISS.from_documents(document_list, embeddings) + db = FAISS.from_documents(document_list, get_hugginface_embedding()) # Check if the db exists, if not, create a new one. if os.path.exists(db_path): - prev_db = FAISS.load_local(db_path, embeddings) + prev_db = FAISS.load_local(db_path, get_hugginface_embedding()) db.merge_from(prev_db) db.save_local(db_path) diff --git a/ufo/llm/openai.py b/ufo/llm/openai.py index ec4823cf..dce02003 100644 --- a/ufo/llm/openai.py +++ b/ufo/llm/openai.py @@ -2,10 +2,14 @@ # Licensed under the MIT License. import datetime -from typing import Any, Optional +import os +import shutil +import sys +from typing import Any, Callable, Literal, Optional import openai from openai import AzureOpenAI, OpenAI +import functools from ufo.llm.base import BaseService @@ -27,28 +31,17 @@ def __init__(self, config, agent_type: str) -> None: self.max_retry = self.config["MAX_RETRY"] self.prices = self.config["PRICES"] assert self.api_type in ["openai", "aoai", "azure_ad"], "Invalid API type" - self.client: OpenAI = ( - OpenAI( - base_url=self.config_llm["API_BASE"], - api_key=self.config_llm["API_KEY"], - max_retries=self.max_retry, - timeout=self.config["TIMEOUT"], - ) - if self.api_type == "openai" - else AzureOpenAI( - max_retries=self.max_retry, - timeout=self.config["TIMEOUT"], - api_version=self.config_llm["API_VERSION"], - azure_endpoint=self.config_llm["API_BASE"], - api_key=( - self.config_llm["API_KEY"] - if self.api_type == "aoai" - else self.get_openai_token() - ), - ) + + self.client: OpenAI = OpenAIService.get_openai_client( + self.api_type, + self.config_llm["API_BASE"], + self.max_retry, + self.config["TIMEOUT"], + self.config_llm.get("API_KEY", ""), + self.config_llm.get("API_VERSION", ""), + aad_api_scope_base=self.config_llm.get("AAD_API_SCOPE_BASE", ""), + aad_tenant_id=self.config_llm.get("AAD_TENANT_ID", ""), ) - if self.api_type == "azure_ad": - 
self.auto_refresh_token() def chat_completion( self, @@ -138,185 +131,229 @@ def chat_completion( # Handle API error, e.g. retry or log raise Exception(f"OpenAI API returned an API Error: {e}") - def get_openai_token( - self, - token_cache_file: str = "apim-token-cache.bin", + @functools.lru_cache() + @staticmethod + def get_openai_client( + api_type: str, + api_base: str, + max_retry: int, + timeout: int, + api_key: Optional[str] = None, + api_version: Optional[str] = None, + aad_api_scope_base: Optional[str] = None, + aad_tenant_id: Optional[str] = None, + ) -> OpenAI: + if api_type == "openai": + assert api_key, "OpenAI API key must be specified" + assert api_base, "OpenAI API base URL must be specified" + client = OpenAI( + base_url=api_base, + api_key=api_key, + max_retries=max_retry, + timeout=timeout, + ) + else: + assert api_version, "Azure OpenAI API version must be specified" + if api_type == "aoai": + assert api_key, "Azure OpenAI API key must be specified" + client = AzureOpenAI( + max_retries=max_retry, + timeout=timeout, + api_version=api_version, + azure_endpoint=api_base, + api_key=api_key, + ) + else: + assert ( + aad_api_scope_base and aad_tenant_id + ), "AAD API scope base and tenant ID must be specified" + token_provider = OpenAIService.get_aad_token_provider( + aad_api_scope_base=aad_api_scope_base, + aad_tenant_id=aad_tenant_id, + ) + client = AzureOpenAI( + max_retries=max_retry, + timeout=timeout, + api_version=api_version, + azure_endpoint=api_base, + azure_ad_token_provider=token_provider, + ) + return client + + @functools.lru_cache() + @staticmethod + def get_aad_token_provider( + aad_api_scope_base: str, + aad_tenant_id: str, + token_cache_file: str = "aoai-token-cache.bin", client_id: Optional[str] = None, client_secret: Optional[str] = None, - ) -> str: + use_azure_cli: Optional[bool] = None, + use_broker_login: Optional[bool] = None, + use_managed_identity: Optional[bool] = None, + use_device_code: Optional[bool] = None, + **kwargs, 
+ ) -> Callable[[], str]: """ - acquire token from Azure AD for your organization + acquire token from Azure AD for OpenAI Parameters ---------- token_cache_file : str, optional - path to the token cache file, by default 'apim-token-cache.bin' in the current directory + path to the token cache file, by default 'aoai-token-cache.bin' in the current directory client_id : Optional[str], optional client id for AAD app, by default None client_secret : Optional[str], optional client secret for AAD app, by default None + use_azure_cli : Optional[bool], optional + use Azure CLI for authentication, by default None. If the Azure CLI is installed and logged in, + it will be used for authentication. This is recommended for headless environments, since the Azure CLI takes + care of token caching and token refresh. + use_broker_login : Optional[bool], optional + use broker login for authentication, by default None. + If not specified, it will be enabled for known supported environments (e.g. Windows, macOS, WSL, VSCode), + but it may not always be able to cache the token for long-term usage. + In such cases, you can disable it by setting it to False. + use_managed_identity : Optional[bool], optional + use managed identity for authentication, by default None. + If not specified, a user-assigned managed identity is used when client_id is specified. + To use the system-assigned managed identity, leave client_id as None and set use_managed_identity to True. + use_device_code : Optional[bool], optional + use device code for authentication, by default None. If not specified, it will use interactive login on supported platforms. 
Returns ------- str - access token for your own organization + access token for OpenAI """ - import os import msal + from azure.identity import ( + AuthenticationRecord, + AzureCliCredential, + ClientSecretCredential, + DeviceCodeCredential, + ManagedIdentityCredential, + TokenCachePersistenceOptions, + get_bearer_token_provider, + ) + from azure.identity.broker import InteractiveBrowserBrokerCredential - cache = msal.SerializableTokenCache() - - def save_cache(): - if token_cache_file is not None and cache.has_state_changed: - with open(token_cache_file, "w") as cache_file: - cache_file.write(cache.serialize()) + api_scope_base = "api://" + aad_api_scope_base - if os.path.exists(token_cache_file): - cache.deserialize(open(token_cache_file, "r").read()) + tenant_id = aad_tenant_id + scope = api_scope_base + "/.default" - authority = ( - "https://login.microsoftonline.com/" + self.config_llm["AAD_TENANT_ID"] + token_cache_option = TokenCachePersistenceOptions( + name=token_cache_file, + enable_persistence=True, + allow_unencrypted_storage=True, ) - api_scope_base = "api://" + self.config_llm["AAD_API_SCOPE_BASE"] - - if client_id is not None and client_secret is not None: - app = msal.ConfidentialClientApplication( - client_id=client_id, - client_credential=client_secret, - authority=authority, - token_cache=cache, - ) - result = app.acquire_token_for_client( - scopes=[ - api_scope_base + "/.default", - ] - ) - if "access_token" in result: - return result["access_token"] - else: - print(result.get("error")) - print(result.get("error_description")) - raise Exception( - "Authentication failed for acquiring AAD token for your organization" - ) - scopes = [api_scope_base + "/" + self.config_llm["AAD_API_SCOPE"]] - app = msal.PublicClientApplication( - self.config_llm["AAD_API_SCOPE_BASE"], - authority=authority, - token_cache=cache, - ) - result = None - for account in app.get_accounts(): + def save_auth_record(auth_record: AuthenticationRecord): try: - result = 
app.acquire_token_silent(scopes, account=account) - if result is not None and "access_token" in result: - save_cache() - return result["access_token"] - result = None - except Exception: - continue - - accounts_in_cache = cache.find(msal.TokenCache.CredentialType.ACCOUNT) - for account in accounts_in_cache: + with open(token_cache_file, "w") as cache_file: + cache_file.write(auth_record.serialize()) + except Exception as e: + print("failed to save auth record", e) + + def load_auth_record() -> Optional[AuthenticationRecord]: try: - refresh_token = cache.find( - msal.CredentialType.REFRESH_TOKEN, - query={"home_account_id": account["home_account_id"]}, - )[0] - result = app.acquire_token_by_refresh_token( - refresh_token["secret"], scopes=scopes + if not os.path.exists(token_cache_file): + return None + with open(token_cache_file, "r") as cache_file: + return AuthenticationRecord.deserialize(cache_file.read()) + except Exception as e: + print("failed to load auth record", e) + return None + + auth_record: Optional[AuthenticationRecord] = load_auth_record() + + current_auth_mode: Literal[ + "client_secret", + "managed_identity", + "az_cli", + "interactive", + "device_code", + "none", + ] = "none" + + implicit_mode = not ( + use_managed_identity or use_azure_cli or use_broker_login or use_device_code + ) + + if use_managed_identity or (implicit_mode and client_id is not None): + if not use_managed_identity and client_secret is not None: + assert ( + client_id is not None + ), "client_id must be specified with client_secret" + current_auth_mode = "client_secret" + identity = ClientSecretCredential( + client_id=client_id, + client_secret=client_secret, + tenant_id=tenant_id, + cache_persistence_options=token_cache_option, + authentication_record=auth_record, ) - if result is not None and "access_token" in result: - save_cache() - return result["access_token"] - result = None - except Exception: - pass - - if result is None: - print("no token available from cache, 
acquiring token from AAD") - # The pattern to acquire a token looks like this. - flow = app.initiate_device_flow(scopes=scopes) - print(flow["message"]) - result = app.acquire_token_by_device_flow(flow=flow) - if result is not None and "access_token" in result: - save_cache() - return result["access_token"] else: - print(result.get("error")) - print(result.get("error_description")) - raise Exception( - "Authentication failed for acquiring AAD token for your organization" + current_auth_mode = "managed_identity" + if client_id is None: + # using default managed identity + identity = ManagedIdentityCredential( + cache_persistence_options=token_cache_option, + ) + else: + identity = ManagedIdentityCredential( + client_id=client_id, + cache_persistence_options=token_cache_option, + ) + elif use_azure_cli or (implicit_mode and shutil.which("az") is not None): + current_auth_mode = "az_cli" + identity = AzureCliCredential(tenant_id=tenant_id) + else: + if implicit_mode: + # enable broker login for known supported envs if not specified using use_device_code + if sys.platform.startswith("darwin") or sys.platform.startswith( + "win32" + ): + use_broker_login = True + elif os.environ.get("WSL_DISTRO_NAME", "") != "": + use_broker_login = True + elif os.environ.get("TERM_PROGRAM", "") == "vscode": + use_broker_login = True + else: + use_broker_login = False + if use_broker_login: + current_auth_mode = "interactive" + identity = InteractiveBrowserBrokerCredential( + tenant_id=tenant_id, + cache_persistence_options=token_cache_option, + use_default_broker_account=True, + parent_window_handle=msal.PublicClientApplication.CONSOLE_WINDOW_HANDLE, + authentication_record=auth_record, + ) + else: + current_auth_mode = "device_code" + identity = DeviceCodeCredential( + tenant_id=tenant_id, + cache_persistence_options=token_cache_option, + authentication_record=auth_record, ) - def auto_refresh_token( - self, - token_cache_file: str = "apim-token-cache.bin", - interval: 
datetime.timedelta = datetime.timedelta(minutes=15), - on_token_update: callable = None, - client_id: Optional[str] = None, - client_secret: Optional[str] = None, - ) -> callable: - """ - helper function for auto refreshing token from your organization - - Parameters - ---------- - token_cache_file : str, optional - path to the token cache file, by default 'apim-token-cache.bin' in the current directory - interval : datetime.timedelta, optional - interval for refreshing token, by default 15 minutes - on_token_update : callable, optional - callback function to be called when token is updated, by default None. In the callback function, you can get token from openai.api_key - - Returns - ------- - callable - a callable function that can be used to stop the auto refresh thread - """ - - import threading - - def update_token(): - import openai - - openai.api_type = ( - "azure" - if self.config_llm["API_TYPE"] == "azure_ad" - else self.config_llm["API_TYPE"] - ) - openai.base_url = self.config_llm["API_BASE"] - openai.api_version = self.config_llm["API_VERSION"] - openai.api_key = self.get_openai_token( - token_cache_file=token_cache_file, - client_id=client_id, - client_secret=client_secret, - ) - - if on_token_update is not None: - on_token_update() - - def refresh_token_thread(): - import time - - while True: - try: - update_token() - except Exception as e: - print("failed to acquire token from AAD for your organization", e) - time.sleep(interval.total_seconds()) + try: + auth_record = identity.authenticate(scopes=[scope]) + if auth_record: + save_auth_record(auth_record) + + except Exception as e: + print( + f"failed to acquire token from AAD for OpenAI using {current_auth_mode}", + e, + ) + raise e try: - update_token() + return get_bearer_token_provider(identity, scope) except Exception as e: - raise Exception("failed to acquire token from AAD for your organization", e) - - thread = threading.Thread(target=refresh_token_thread, daemon=True) - thread.start() - - def 
stop(): - thread.stop() - - return stop + print("failed to acquire token from AAD for OpenAI", e) + raise e diff --git a/ufo/prompts/apps/excel/api.yaml b/ufo/prompts/apps/excel/api.yaml index 6da08bd5..b047f740 100644 --- a/ufo/prompts/apps/excel/api.yaml +++ b/ufo/prompts/apps/excel/api.yaml @@ -4,10 +4,10 @@ table2markdown: class_name: |- GetSheetContent usage: |- - [1] API call: table2markdown(sheet_name: str) + [1] API call: table2markdown(sheet_name: Union[str, int]) [2] Args: - - sheet_name: The name of the sheet in the Excel app. - [3] Example: table2markdown(sheet_name="Sheet1") + - sheet_name: The name or index of the sheet to get the table content. The index starts from 1. + [3] Example: table2markdown(sheet_name=1) [4] Available control item: Any control item in the Excel app. [5] Return: the markdown format string of the table content of the sheet. diff --git a/ufo/prompts/apps/web/api.yaml b/ufo/prompts/apps/web/api.yaml index fce13a9c..ebff6076 100644 --- a/ufo/prompts/apps/web/api.yaml +++ b/ufo/prompts/apps/web/api.yaml @@ -1,6 +1,6 @@ web_crawler: summary: |- - "web_crawler" is to run crawler to fetch content from the entire web page, and return markdown format of the web page. You should call this API when you want to fetch the content of the web page for further processing, extraction, or analysis. + "web_crawler" is to run crawler to fetch content from the entire web page, and return markdown format of the web page. You should call this API when you want to fetch the content of the web page for further processing, extraction, or analysis. Note that the API will return the entire content of the web page, and you need to extract the information you need by yourself. Do not use this API to fetch the content of the web page that you do not have permission to access. class_name: |- WebCrawlerCommand usage: |- @@ -10,4 +10,4 @@ web_crawler: - ignore_link: Whether to ignore the link in the html data. Default is False. 
[3] Example: web_crawler(url='https://www.google.com', ignore_link=False) [4] Available control item: All - [5] Return: The webpage content string in markdown format. \ No newline at end of file + [5] Return: The entire webpage content string in a markdown format. \ No newline at end of file diff --git a/ufo/prompts/examples/visual/app_agent_example.yaml b/ufo/prompts/examples/visual/app_agent_example.yaml index 3555334f..7a69f6be 100644 --- a/ufo/prompts/examples/visual/app_agent_example.yaml +++ b/ufo/prompts/examples/visual/app_agent_example.yaml @@ -378,3 +378,63 @@ example11: Tips: |- - The Copilot Add-in can directly create a presentation from the plant.pptx. You need to use the Copilot Add-in to complete the task. + +example12: + Request: |- + Add a title slide to the presentation.pptx on its first slide with the title 'Project Update'. + Sub-task: |- + Add a title slide to the presentation.pptx on its first slide with the title 'Project Update'. + Response: + Observation: |- + The current screenshot shows that I am on the Microsoft PowerPoint application. The first slide of the presentation.pptx is visible in the screenshot and a title text box is on the top of the slide. + Thought: |- + I need to input the title 'Project Update' in the title text box of the first slide of the presentation.pptx. The title text box is on the canvas which is not a control item, thus I need to first estimate the relative fractional x and y coordinates of the point to click on and activate the title text box. The estimated coordinates of the point to click on are (0.35, 0.4). + ControlLabel: |- + "" + ControlText: |- + "" + Function: |- + click_on_coordinates + Args: + {"x": 0.35, "y": 0.4, "button": "left", "double": false} + Status: |- + CONTINUE + Plan: + - (1) Input the title 'Project Update' in the title text box of the first slide of the presentation.pptx. 
+ SaveScreenshot: + {"save": false, "reason": ""} + Comment: |- + I need to estimate the relative fractional x and y coordinates of the point to click on and activate the title text box, so that I can input the title 'Project Update'. + Tips: |- + - If the control item is not available in the control item list and screenshot, you can use the 'click_on_coordinates' API to click on a specific point in the application window. + + +example13: + Request: |- + Fill the information for top 3 events one by one in the forms of private Event Bookings web page. + Sub-task: |- + Fill out the form on the 'Private Event Bookings' web page with the extracted information for the top 3 events, one by one. + Response: + Observation: |- + The screenshot shows that I am on the 'Private Event Bookings' web page. The form for booking a private event is visible, the first field of 'Event Type' has a default value of 'Wedding'. + Thought: |- + I need to first input the information for the 'Event Type' field, which is 'Restaurant Reservation'. However, the 'Event Type' field is already filled with 'Wedding'. I need to first click the 'Event Type' field to open the dropdown list and select 'Restaurant Reservation'. + ControlLabel: |- + 70 + ControlText: |- + Event Type + Function: |- + click_input + Args: + {"button": "left", "double": false} + Status: |- + CONTINUE + Plan: + - (1) Click the 'Event Type' field to open the dropdown list. + - (2) Select 'Restaurant Reservation' from the dropdown list. + - (3) Input the information for the 'Event Name' field, which is 'Birthday Party'. + - (4) Input the information for the 'Event Date' field, which is '2022-12-25'. + - (5) Input the information for the 'Event Time' field, which is '18:00'. + - (6) Click the 'Submit' button to submit the form. + Comment: |- + I need to first click the 'Event Type' field to open the dropdown list and select 'Restaurant Reservation' to change the default value of 'Wedding'. 
diff --git a/ufo/prompts/examples/visual/host_agent_example.yaml b/ufo/prompts/examples/visual/host_agent_example.yaml index e4bcfb14..1376c537 100644 --- a/ufo/prompts/examples/visual/host_agent_example.yaml +++ b/ufo/prompts/examples/visual/host_agent_example.yaml @@ -297,6 +297,35 @@ example11: None +example12: + Request: |- + Go to the web page of Bikes - Trek Bikes, and get the name of each bike and its price of the whole page and its following pages into a table in Book1 excel file. You need to get information for the first 3 pages. + Response: + Observation: |- + The current screenshot shows the Edge browser is visible. + Thought: |- + The user request should be completed on the Edge browser first. After all the information is collected, I need to switch to the Excel application to write the information into the Book1 excel file. + CurrentSubtask: + Extract the name and price of each bike from the 'Bikes - Trek Bikes' web page, including all its following pages. + Message: + - (1) You should use the web_crawler API to get markdown content of the web page, and extract the name and price of each bike by yourself. + - (2) There may be multiple pages of the website. Once you finish extracting the information from the current page, you need to scroll down to the bottom of the page to find a button to navigate to the next page. + - (3) Check if the results already contain the required information. If so, do not repeat the extraction with the web_crawler API. + - (4) You must extract the information for the first 3 pages. + ControlLabel: |- + 4 + ControlText: |- + Bikes - Trek Bikes - Microsoft Edge + Status: |- + CONTINUE + Plan: + - Create a table in Book1 excel file to write the information of the name and price of each bike. + Comment: |- + I need to first use the web_crawler API to get the name and price of each bike on the web page of Bikes - Trek Bikes. 
+ Questions: [] + AppsToOpen: |- + None + example_openapp1: Request: open a ppt file on my desktop named test.pptx and modify the title to Apple is the best tech Company diff --git a/ufo/prompts/share/base/api.yaml b/ufo/prompts/share/base/api.yaml index 7632fd51..aa3b39e5 100644 --- a/ufo/prompts/share/base/api.yaml +++ b/ufo/prompts/share/base/api.yaml @@ -13,6 +13,41 @@ click_input: [5] Return: None +click_on_coordinates: + summary: |- + "click_on_coordinates" is to click on the specific coordinates in the application window, instead of clicking on a specific control item. This API is useful when the control item is not available in the control item list and screenshot, but you want to click on a specific point in the application window. When you use this API, you must estimate the relative fractional x and y coordinates of the point to click on, ranging from 0.0 to 1.0. The origin is the top-left corner of the application window. + class_name: |- + ClickOnCoordinatesCommand + usage: |- + [1] API call: click_on_coordinates(x: float, y: float, button: str, double: bool) + [2] Args: + - x: The relative fractional x-coordinate of the point to click on, ranging from 0.0 to 1.0. The origin is the top-left corner of the application window. + - y: The relative fractional y-coordinate of the point to click on, ranging from 0.0 to 1.0. The origin is the top-left corner of the application window. + - button: The mouse button to click. One of 'left', 'right'. (Default: 'left') + - double: Whether to perform a double click or not. (Default: False) + [3] Example: click_on_coordinates(x=0.5, y=0.5, button="left", double=False) + [4] Available control item: Control item is not required for this API. + [5] Return: None + + +drag_on_coordinates: + summary: |- + "drag_on_coordinates" is to drag from one point to another point in the application window, instead of dragging a specific control item. 
This API is useful when the control item is not available in the control item list and screenshot, but you want to drag from one point to another point in the application window. When you use this API, you must estimate the relative fractional x and y coordinates of the starting point and ending point to drag from and to, ranging from 0.0 to 1.0. The origin is the top-left corner of the application window. + class_name: |- + DragOnCoordinatesCommand + usage: |- + [1] API call: drag_on_coordinates(start_x: float, start_y: float, end_x: float, end_y: float, button: str = "left") + [2] Args: + - start_x: The relative fractional x-coordinate of the starting point to drag from, ranging from 0.0 to 1.0. The origin is the top-left corner of the application window. + - start_y: The relative fractional y-coordinate of the starting point to drag from, ranging from 0.0 to 1.0. The origin is the top-left corner of the application window. + - end_x: The relative fractional x-coordinate of the ending point to drag to, ranging from 0.0 to 1.0. The origin is the top-left corner of the application window. + - end_y: The relative fractional y-coordinate of the ending point to drag to, ranging from 0.0 to 1.0. The origin is the top-left corner of the application window. + - button: The mouse button to drag. One of 'left', 'right'. (Default: 'left') + [3] Example: drag_on_coordinates(start_x=0.1, start_y=0.1, end_x=0.9, end_y=0.9, button="left") + [4] Available control item: Control item is not required for this API. + [5] Return: None + + set_edit_text: summary: |- "set_edit_text" is to add new text to the control item. If there is already text in the control item, the new text will append to the end of the existing text. @@ -73,24 +108,26 @@ wheel_mouse_input: usage: |- [1] API call: wheel_mouse_input(wheel_dist: int) [2] Args: - - wheel_dist: The distance to scroll. Positive values indicate upward scrolling, negative values indicate downward scrolling. 
+ - wheel_dist: The distance to scroll. Positive values indicate upward scrolling, negative values indicate downward scrolling. If you want to scroll all the way to the top, you can use a large positive value, such as 10000; to scroll all the way to the bottom (the end), use a large negative value, such as -10000. [3] Example: wheel_mouse_input(wheel_dist=-20) [4] All control items. [5] Return: None keyboard_input: summary: |- - "keyboard_input" is to simulate the keyboard input. You can use this API to simulate the keyboard input, such as shortcut keys, or any other keys that you want to input. + "keyboard_input" is to simulate the keyboard input, such as shortcut keys, or any other keys that you want to input. It can be applied to any control item, or just type the keys in the application window without focusing on any control item. class_name: |- keyboardInputCommand usage: |- - [1] API call: keyboard_input(keys: str) + [1] API call: keyboard_input(keys: str, control_focus: bool = True) [2] Args: - keys: The key to input. It can be any key on the keyboard, with special keys represented by their virtual key codes. For example, "{VK_CONTROL}c" represents the Ctrl+C shortcut key. + - control_focus: Whether to focus on your selected control item before typing the keys. (Default: True) [3] Example: - keyboard_input(keys="{VK_CONTROL}c") --> Copy the selected text. - keyboard_input(keys="{ENTER}") --> Press the Enter key. - keyboard_input(keys="{TAB 2}") --> Press the Tab key twice. + - keyboard_input(keys="Hello World", control_focus=False) --> Type "Hello World" without focusing on any control item. [4] Available control item: All control items. [5] Return: None diff --git a/ufo/prompts/share/base/app_agent.yaml b/ufo/prompts/share/base/app_agent.yaml index 317065fc..201a6e7d 100644 --- a/ufo/prompts/share/base/app_agent.yaml +++ b/ufo/prompts/share/base/app_agent.yaml @@ -98,6 +98,7 @@ system: |- - You must use double-quoted string for the string arguments of your control Action. {{"text": "Hello World. \\n you're my friend. Tom's home is great.')"}}. 
Otherwise it will crash the system and destroy the user's computer. - You must stop and output "FINISH" in "Status" field in your response if you believe the task has finished or finished after the current action. - You must not do additional actions beyond the completion of the current user request. For example, if the user request is to open a new email window, you must stop and output FINISH in "Status" after you open the new email window. + - The Plan you provide is only for the future steps after the current action. You must not include the current action in the Plan. - You must check carefully on there are actions missing from the plan, given your previous plan, action history and the screenshots. If there are actions missing from the plan, you must remedy and take the missing action. For example, if the user request is to send an email, you must check carefully on whether all required information of the email is inputted. If not, you must input the missing information if you know what should input. - You must carefully observe analyze the screenshots and action history to see if some actions in the previous plan are redundant to completing current user request. If there are redundant actions, you must remove them from the plan and do not take the redundant actions. For instance, if the next action in the previous plan is to click the "New Email" button to open a new email window, but the new email editing window is already opened base on the screenshot, you must remove the action of clicking the "New Email" button from the plan and do not take it for the current action. - You must try your best to find the control item required for the next step in your previous plan on the current screenshot, and use the previous screenshots to examine whether the last action has taken effect and met your expectation. The more careful your observe and analyze, the more tip you will get. 
@@ -108,6 +109,7 @@ system: |- - Your output of SaveScreenshot must be strictly in the format of {{"save": True/False, "reason": "The reason for saving the screenshot"}}. Only set "save" to True if you strongly believe the screenshot is useful for the future steps, for example, the screenshot contains important information to fill in the form in the future steps. You must provide a reason for saving the screenshot in the "reason" field. - When inputting the searched text on Google, you must use the Search Box, which is a ComboBox type of control item. Do not use the address bar to input the searched text. - The 'Copilot' Add-in can help you with some special requests, such as creating a slide in PowerPoint from a Word document, or summarizing the entire ppt. + - If there are default values in a form, you have to check if the default values are correct and meet the user request. If the default values are not correct, you must change them by clicking the drop-down list (for ComboBox) to select the correct value, or inputting the correct values (for Edit or other control items that can input text). - Saving a ppt file into pdf format can be done by clicking the "Save As Adobe PDF" button. - You are given the help documents of the application or/and the online search results for completing the user request. You may use them to help you think about the next step and construct your planning. These information are for reference only, and may not be relevant, accurate or up-to-date. You must rely more on the current screenshots and control item list to make the decision. - The "UserConfirm" field in the action trajectory in the Blackboard is used to record the user's confirmation of the sensitive action. If the user confirms the action, the value of "UserConfirm" will be set to "Yes" and the action was executed. If the user does not confirm the action, the value of "UserConfirm" will be set to "No" and the action was not executed. 
@@ -200,6 +202,7 @@ system_nonvisual: |- - You must look at the control item list carefully, analyse the current status before you select the control item and take action on it. Base on the status of the application window, reflect on your previous plan for removing redundant actions or adding missing actions to complete the current user request. - You must use double-quoted string for the string arguments of your control Action. {{"text": "Hello World. \\n you're my friend. Tom's home is great.')"}}. Otherwise it will crash the system and destroy the user's computer. - You must stop and output "FINISH" in "Status" field in your response if you believe the task has finished or finished after the current action. + - The Plan you provide is only for the future steps after the current action. You must not include the current action in the Plan. - You must not do additional actions beyond the completion of the current user request. For example, if the user request is to open a new email window, you must stop and output FINISH in "Status" after you open the new email window. - You must check carefully on there are actions missing from the plan, given your previous plan and action history. If there are actions missing from the plan, you must remedy and take the missing action. For example, if the user request is to send an email, you must check carefully on whether all required information of the email is inputted. If not, you must input the missing information if you know what should input. - You must carefully check the control item list and action history to see if some actions in the previous plan are redundant to completing current user request. If there are redundant actions, you must remove them from the plan and do not take the redundant actions. 
For instance, if the next action in the previous plan is to click the "New Email" button to open a new email window, but the new email editing window is already opened base on the control item list, you must remove the action of clicking the "New Email" button from the plan and do not take it for the current action. @@ -208,6 +211,7 @@ system_nonvisual: |- - Try to locate and use the "Results" in the to complete the user request, such as adding these results along with information to meet the user request into SetText when composing a message, email or document, when necessary. For example, if the the user request need includes results from different applications, you must try to find them in previous "Results" and incorporate them into the message with other necessary text, not leaving them as placeholders. Make sure the text composed is integrated and meets the user request. - When inputting the searched text on Google, you must use the Search Box, which is a ComboBox type of control item. Do not use the address bar to input the searched text. - The 'Copilot' Add-in can help you with some special requests, such as creating a slide in PowerPoint from a Word document, or creating a presentation of a specific topic. + - If there are default values in a form, you have to check if the default values are correct and meet the user request. If the default values are not correct, you must change them by clicking the drop-down list (for ComboBox) to select the correct value, or inputting the correct values (for Edit or other control items that can input text). - You are given the help documents of the application or/and the online search results for completing the user request. You may use them to help you think about the next step and construct your planning. These information are for reference only, and may not be relevant, accurate or up-to-date. You must rely more on the current control item list to make the decision. 
- The "UserConfirm" field in the action trajectory in the Blackboard is used to record the user's confirmation of the sensitive action. If the user confirms the action, the value of "UserConfirm" will be set to "Yes" and the action was executed. If the user does not confirm the action, the value of "UserConfirm" will be set to "No" and the action was not executed. diff --git a/ufo/rag/retriever.py b/ufo/rag/retriever.py index 509951dd..ae1eb272 100644 --- a/ufo/rag/retriever.py +++ b/ufo/rag/retriever.py @@ -3,13 +3,11 @@ from abc import ABC, abstractmethod -from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import FAISS from ufo.config.config import get_offline_learner_indexer_config from ufo.rag import web_search -from ufo.utils import print_with_color - +from ufo.utils import print_with_color, get_hugginface_embedding class RetrieverFactory: """ @@ -112,10 +110,7 @@ def get_indexer(self, path: str): return None try: - embeddings = HuggingFaceEmbeddings( - model_name="sentence-transformers/all-mpnet-base-v2" - ) - db = FAISS.load_local(path, embeddings) + db = FAISS.load_local(path, get_hugginface_embedding()) return db except: print_with_color( @@ -146,10 +141,7 @@ def get_indexer(self, db_path: str): """ try: - embeddings = HuggingFaceEmbeddings( - model_name="sentence-transformers/all-mpnet-base-v2" - ) - db = FAISS.load_local(db_path, embeddings) + db = FAISS.load_local(db_path, get_hugginface_embedding()) return db except: print_with_color( @@ -216,10 +208,7 @@ def get_indexer(self, db_path: str): """ try: - embeddings = HuggingFaceEmbeddings( - model_name="sentence-transformers/all-mpnet-base-v2" - ) - db = FAISS.load_local(db_path, embeddings) + db = FAISS.load_local(db_path, get_hugginface_embedding()) return db except: print_with_color( diff --git a/ufo/rag/web_search.py b/ufo/rag/web_search.py index 23ac4225..c7c46f46 100644 --- a/ufo/rag/web_search.py +++ b/ufo/rag/web_search.py @@ -4,11 +4,10 @@ 
import requests from langchain.docstore.document import Document from langchain.text_splitter import HTMLHeaderTextSplitter -from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import FAISS from ufo.config.config import Config -from ufo.utils import print_with_color +from ufo.utils import get_hugginface_embedding, print_with_color configs = Config.get_instance().config_data @@ -112,10 +111,7 @@ def create_indexer(self, documents: list): :param query: The query to create an indexer for. :return: The created indexer. """ - embeddings = HuggingFaceEmbeddings( - model_name="sentence-transformers/all-mpnet-base-v2" - ) - db = FAISS.from_documents(documents, embeddings) + db = FAISS.from_documents(documents, get_hugginface_embedding()) return db diff --git a/ufo/utils/__init__.py b/ufo/utils/__init__.py index 781bb4e5..d3e66b6a 100644 --- a/ufo/utils/__init__.py +++ b/ufo/utils/__init__.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. import importlib +import functools import json import os from typing import Optional, Any, Dict @@ -147,3 +148,10 @@ def append_string_to_file(file_path: str, string: str) -> None: # Append the string to the file. with open(file_path, "a", encoding="utf-8") as file: file.write(string + "\n") + +@functools.lru_cache(maxsize=5) +def get_hugginface_embedding( + model_name: str = "sentence-transformers/all-mpnet-base-v2" +): + from langchain_community.embeddings import HuggingFaceEmbeddings + return HuggingFaceEmbeddings(model_name=model_name) \ No newline at end of file