microsoft · vyokky · Sep 8, 2024 · Jun 22, 2024 · Jul 23, 2024 · Jul 23, 2024
diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 <h1 align="center">
-    <b>UFO</b> <img src="./assets/ufo_blue.png" alt="UFO Image" width="40">: A <b>U</b>I-<b>F</b>ocused Agent for Windows <b>O</b>S Interaction
+    <b>UFO</b> <img src="./assets/ufo_blue.png" alt="UFO Image" width="40">: A <b>U</b>I-<b>Fo</b>cused Agent for Windows OS Interaction
 </h1>
 
 
@@ -35,6 +35,7 @@ Both agents leverage the multi-modal capabilities of GPT-Vision to comprehend th
 
 
 ## 📢 News
+- 📅 2024-09-08: We have a **New Release for v1.1.0!** It allows UFO to click on any region of the application and reduces its latency by up to 1/3!
 - 📅 2024-07-06: We have a **New Release for v1.0.0!**.  You can check out our [documentation](https://microsoft.github.io/UFO/). We welcome your contributions and feedback!
 - 📅 2024-06-28: We are thrilled to announce that our official introduction video is now available on [YouTube](https://www.youtube.com/watch?v=QT_OhygMVXU)!
 - 📅 2024-06-25: **New Release for v0.2.1!**  We are excited to announce the release of version 0.2.1! This update includes several new features and improvements:
@@ -52,7 +53,7 @@ Both agents leverage the multi-modal capabilities of GPT-Vision to comprehend th
     1. We now support creating your help documents for each Windows application to become an app expert. Check the [documentation](https://microsoft.github.io/UFO/creating_app_agent/help_document_provision/) for more details!
     2. UFO now supports RAG from offline documents and online Bing search.
     3. You can save the task completion trajectory into its memory for UFO's reference, improving its future success rate!
-    4. You can customize different GPT models for AppAgent and ActAgent. Text-only models (e.g., GPT-4) are now supported!
+    4. You can customize different GPT models for HostAgent and AppAgent. Text-only models (e.g., GPT-4) are now supported!
 - 📅 2024-02-14: Our [technical report](https://arxiv.org/abs/2402.07939) is online!
 - 📅 2024-02-10: UFO is released on GitHub🎈. Happy Chinese New year🐉!
 

diff --git a/documents/docs/automator/ui_automator.md b/documents/docs/automator/ui_automator.md
@@ -65,10 +65,15 @@ Below is the list of available commands in the UI Automator that are currently s
 | Command Name | Function Name | Description |
 |--------------|---------------|-------------|
 | `ClickInputCommand` | `click_input` | Click the control item with the mouse. |
+| `ClickOnCoordinatesCommand` | `click_on_coordinates` | Click on the specific fractional coordinates of the application window. |
+| `DragOnCoordinatesCommand` | `drag_on_coordinates` | Drag the mouse on the specific fractional coordinates of the application window. |
 | `SetEditTextCommand` | `set_edit_text` | Add new text to the control item. |
 | `GetTextsCommand` | `texts` | Get the text of the control item. |
 | `WheelMouseInputCommand` | `wheel_mouse_input` | Scroll the control item. |
 | `KeyboardInputCommand` | `keyboard_input` | Simulate the keyboard input. |
 
+!!! tip
+    Please refer to the `ufo/prompts/share/base/api.yaml` file for the detailed API documentation of the UI Automator.
+
 !!! tip
     You can customize the commands by adding new command classes to the `ufo/automator/ui_control/controller/ControlCommand` module.
diff --git a/documents/docs/automator/web_automator.md b/documents/docs/automator/web_automator.md
@@ -55,4 +55,8 @@ Below is the list of available commands in the Web Automator that are currently
 
 | Command Name | Function Name | Description |
 |--------------|---------------|-------------|
-| `WebCrawlerCommand` | `web_crawler` | Get the content of a web page into a markdown format. |
+| `WebCrawlerCommand` | `web_crawler` | Get the content of a web page into a markdown format. |
+
+
+!!! tip
+    Please refer to the `ufo/prompts/apps/web/api.yaml` file for the prompt details for the `WebCrawlerCommand` command.
diff --git a/documents/docs/automator/wincom_automator.md b/documents/docs/automator/wincom_automator.md
@@ -75,6 +75,9 @@ Below is the list of available commands in the API Automator that are currently
 | `InsertExcelTableCommand` | `insert_excel_table` | Insert a table to the Excel sheet. |
 
 
+!!! tip
+    Please refer to the `ufo/prompts/apps/{app_name}/api.yaml` file for the prompt details for the commands.
+
 !!! tip
     You can customize the commands by adding new command classes to the `ufo/automator/app_apis/{app_name}/` module.
 

diff --git a/learner/indexer.py b/learner/indexer.py
@@ -1,9 +1,9 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
+from ufo.utils import get_hugginface_embedding
 from . import xml_loader
 from .utils import load_json_file, save_json_file, print_with_color
-from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 import os
 
@@ -39,9 +39,7 @@ def create_indexer(app: str, docs: str, format: str, incremental: bool, save_pat
     )
 
     if format == "xml":
-        embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-mpnet-base-v2"
-        )
+        embeddings = get_hugginface_embedding()
     else:
         raise ValueError("Invalid format: " + format)
 

diff --git a/record_processor/summarizer/summarizer.py b/record_processor/summarizer/summarizer.py
@@ -6,7 +6,6 @@
 
 import yaml
 from langchain.docstore.document import Document
-from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 
 from record_processor.parser.demonstration_record import DemonstrationRecord
@@ -183,14 +182,11 @@ def create_or_update_vector_db(summaries: list, db_path: str):
             request = summary["request"]
             document_list.append(Document(page_content=request, metadata=summary))
 
-        embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-mpnet-base-v2"
-        )
-        db = FAISS.from_documents(document_list, embeddings)
+        db = FAISS.from_documents(document_list, get_hugginface_embedding())
 
         # Check if the db exists, if not, create a new one.
         if os.path.exists(db_path):
-            prev_db = FAISS.load_local(db_path, embeddings)
+            prev_db = FAISS.load_local(db_path, get_hugginface_embedding())
             db.merge_from(prev_db)
 
         db.save_local(db_path)

diff --git a/ufo/agents/agent/host_agent.py b/ufo/agents/agent/host_agent.py
@@ -261,7 +261,7 @@ def print_response(self, response_dict: Dict) -> None:
         subtask = response_dict.get("CurrentSubtask")
 
         # Convert the message from a list to a string.
-        message = list(response_dict.get("Message"))
+        message = list(response_dict.get("Message", ""))
         message = "\n".join(message)
 
         # Concatenate the subtask with the plan and convert the plan from a list to a string.

diff --git a/ufo/agents/processors/app_agent_processor.py b/ufo/agents/processors/app_agent_processor.py
@@ -11,10 +11,12 @@
 
 from ufo import utils
 from ufo.agents.processors.basic import BaseProcessor
+from ufo.automator.ui_control.screenshot import PhotographerDecorator
 from ufo.automator.ui_control.control_filter import ControlFilterFactory
 from ufo.config.config import Config
 from ufo.module.context import Context, ContextNames
 
+
 if TYPE_CHECKING:
     from ufo.agents.agent.app_agent import AppAgent
 
@@ -77,6 +79,7 @@ def print_step_info(self) -> None:
             "magenta",
         )
 
+    @BaseProcessor.method_timer
     def capture_screenshot(self) -> None:
         """
         Capture the screenshot.
@@ -171,6 +174,7 @@ def capture_screenshot(self) -> None:
 
             self._save_to_xml()
 
+    @BaseProcessor.method_timer
     def get_control_info(self) -> None:
         """
         Get the control information.
@@ -191,6 +195,7 @@ def get_control_info(self) -> None:
             )
         )
 
+    @BaseProcessor.method_timer
     def get_prompt_message(self) -> None:
         """
         Get the prompt message for the AppAgent.
@@ -232,6 +237,7 @@ def get_prompt_message(self) -> None:
         )
         self.request_logger.debug(log)
 
+    @BaseProcessor.method_timer
     def get_response(self) -> None:
         """
         Get the response from the LLM.
@@ -247,6 +253,7 @@ def get_response(self) -> None:
             self.llm_error_handler()
             return
 
+    @BaseProcessor.method_timer
     def parse_response(self) -> None:
         """
         Parse the response.
@@ -277,38 +284,59 @@ def parse_response(self) -> None:
         self.status = self._response_json.get("Status", "")
         self.app_agent.print_response(self._response_json)
 
+    @BaseProcessor.method_timer
     def execute_action(self) -> None:
         """
         Execute the action.
         """
 
+        control_selected = self._annotation_dict.get(self._control_label, "")
+
         try:
             # Get the selected control item from the annotation dictionary and LLM response.
             # The LLM response is a number index corresponding to the key in the annotation dictionary.
-            control_selected = self._annotation_dict.get(self._control_label, "")
 
             if control_selected:
-                control_selected.draw_outline(colour="red", thickness=3)
-                time.sleep(configs.get("RECTANGLE_TIME", 0))
 
-            self.app_agent.Puppeteer.receiver_manager.create_ui_control_receiver(
-                control_selected, self.application_window
-            )
+                if configs.get("SHOW_VISUAL_OUTLINE_ON_SCREEN", True):
+                    control_selected.draw_outline(colour="red", thickness=3)
+                    time.sleep(configs.get("RECTANGLE_TIME", 0))
 
-            # Save the screenshot of the tagged selected control.
-            self.capture_control_screenshot(control_selected)
+                control_coordinates = PhotographerDecorator.coordinate_adjusted(
+                    self.application_window.rectangle(), control_selected.rectangle()
+                )
 
-            if self.status.upper() == self._agent_status_manager.SCREENSHOT.value:
-                self.handle_screenshot_status()
-            else:
-                self._results = self.app_agent.Puppeteer.execute_command(
-                    self._operation, self._args
+                self._control_log = {
+                    "control_class": control_selected.element_info.class_name,
+                    "control_type": control_selected.element_info.control_type,
+                    "control_automation_id": control_selected.element_info.automation_id,
+                    "control_friendly_class_name": control_selected.friendly_class_name(),
+                    "control_coordinates": {
+                        "left": control_coordinates[0],
+                        "top": control_coordinates[1],
+                        "right": control_coordinates[2],
+                        "bottom": control_coordinates[3],
+                    },
+                }
+
+                self.app_agent.Puppeteer.receiver_manager.create_ui_control_receiver(
+                    control_selected, self.application_window
                 )
-                self.control_reannotate = None
-            if not utils.is_json_serializable(self._results):
-                self._results = ""
 
-                return
+                # Save the screenshot of the tagged selected control.
+                self.capture_control_screenshot(control_selected)
+
+                if self.status.upper() == self._agent_status_manager.SCREENSHOT.value:
+                    self.handle_screenshot_status()
+                else:
+                    self._results = self.app_agent.Puppeteer.execute_command(
+                        self._operation, self._args
+                    )
+                    self.control_reannotate = None
+                if not utils.is_json_serializable(self._results):
+                    self._results = ""
+
+                    return
 
         except Exception:
             self.general_error_handler()
@@ -365,14 +393,16 @@ def update_memory(self) -> None:
             "Action": self.action,
             "ActionType": self.app_agent.Puppeteer.get_command_types(self._operation),
             "Request": self.request,
-            "Agent": "ActAgent",
+            "Agent": "AppAgent",
             "AgentName": self.app_agent.name,
             "Application": app_root,
             "Cost": self._cost,
             "Results": self._results,
         }
         self._memory_data.set_values_from_dict(self._response_json)
         self._memory_data.set_values_from_dict(additional_memory)
+        self._memory_data.set_values_from_dict(self._control_log)
+        self._memory_data.set_values_from_dict({"time_cost": self._time_cost})
 
         if self.status.upper() == self._agent_status_manager.CONFIRM.value:
             self._memory_data.set_values_from_dict({"UserConfirm": "Yes"})
@@ -381,7 +411,7 @@ def update_memory(self) -> None:
 
         # Log the memory item.
         self.context.add_to_structural_logs(self._memory_data.to_dict())
-        self.log(self._memory_data.to_dict())
+        # NOTE: self.log(...) is no longer called here; log_save() writes the log at the end of process().
 
         # Only memorize the keys in the HISTORY_KEYS list to feed into the prompt message in the future steps.
         memorized_action = {

diff --git a/ufo/agents/processors/basic.py b/ufo/agents/processors/basic.py
@@ -2,11 +2,12 @@
 # Licensed under the MIT License.
 
 
+from functools import wraps
 import json
 import time
 import traceback
 from abc import ABC, abstractmethod
-from typing import List
+from typing import Any, List
 
 from pywinauto.controls.uiawrapper import UIAWrapper
 
@@ -57,6 +58,15 @@ def __init__(self, agent: BasicAgent, context: Context) -> None:
         self._action = None
         self._plan = None
 
+        self._control_log = {
+            "control_class": None,
+            "control_type": None,
+            "control_automation_id": None,
+        }
+
+        self._total_time_cost = 0
+        self._time_cost = {}
+
     def process(self) -> None:
         """
         Process a single step in a round.
@@ -71,9 +81,11 @@ def process(self) -> None:
         8. Execute the action.
         9. Update the memory.
         10. Update the step and status.
-        11. Update the step.
+        11. Save the log.
         """
 
+        start_time = time.time()
+
         # Step 1: Print the step information.
         self.print_step_info()
 
@@ -115,8 +127,10 @@ def process(self) -> None:
         # Step 10: Update the status.
         self.update_status()
 
-        # Step 11: Update the context.
-        self.update_step()
+        self._total_time_cost = time.time() - start_time
+
+        # Step 11: Save the log.
+        self.log_save()
 
     def resume(self) -> None:
         """
@@ -139,6 +153,24 @@ def resume(self) -> None:
 
         self._is_resumed = False
 
+    @classmethod
+    def method_timer(cls, func):
+        """
+        Decorator that measures a method's wall-clock duration and stores it in self._time_cost under the method's name.
+        :param func: The method to be decorated.
+        :return: The decorated method.
+        """
+
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            start_time = time.time()
+            result = func(self, *args, **kwargs)
+            end_time = time.time()
+            self._time_cost[func.__name__] = end_time - start_time
+            return result
+
+        return wrapper
+
     @abstractmethod
     def print_step_info(self) -> None:
         """
@@ -205,6 +237,19 @@ def update_status(self) -> None:
         if self.status != self._agent_status_manager.FINISH.value:
             time.sleep(configs["SLEEP_TIME"])
 
+        self.round_step += 1
+        self.session_step += 1
+
+    def log_save(self) -> None:
+        """
+        Add the total time cost to the memory data and write the memory data to the log.
+        """
+
+        self._memory_data.set_values_from_dict(
+            {"total_time_cost": self._total_time_cost}
+        )
+        self.log(self._memory_data.to_dict())
+
     @property
     def context(self) -> Context:
         """
@@ -221,14 +266,6 @@ def update_cost(self) -> None:
         self.round_cost += self.cost
         self.session_cost += self.cost
 
-    def update_step(self) -> None:
-        """
-        Update the step.
-        """
-
-        self.round_step += 1
-        self.session_step += 1
-
     @property
     def agent(self) -> BasicAgent:
         """
@@ -684,7 +721,7 @@ def llm_error_handler(self) -> None:
         return
 
     @staticmethod
-    def string2list(string: str) -> List[str]:
+    def string2list(string: Any) -> List[str]:
         """
         Convert a string to a list of string if the input is a string.
         :param string: The string.