microsoft · vyokky · May 11, 2024 · May 10, 2024
diff --git a/ufo/config/config.py b/ufo/config/config.py
@@ -69,7 +69,6 @@ def update_api_base(configs: dict, agent: str) -> None:
         :param configs: The configuration dictionary.
         :param agent: The agent name.
         """
-
         if configs[agent]["API_TYPE"].lower() == "aoai":
             if "deployments" not in configs[agent]["API_BASE"]:
                 configs[agent]["API_BASE"] = (

diff --git a/ufo/eval/__init__.py b/ufo/eval/__init__.py
diff --git a/ufo/eval/evaluate.py b/ufo/eval/evaluate.py
@@ -0,0 +1,127 @@
+import json
+import os
+from ufo.llm.llm_call import get_completions
+from ufo.automator.ui_control.screenshot import PhotographerFacade
+from ufo.utils import json_parser
+
+class Evaluator():
+    """
+    The evaluator that asks the LLM whether the agent has completed the task of a session.
+    It reads the step logs and the first/final screenshots from a log directory and packs them into a chat prompt.
+    """
+
+    # Keys copied unchanged from each log record into the step trajectory, in prompt order.
+    _STEP_FIELDS = (
+        "Step",
+        "Agent",
+        "AgentName",
+        "Observation",
+        "Thought",
+        "ControlLabel",
+        "ControlText",
+        "Status",
+        "Plan",
+        "Comment",
+        "RoundStep",
+        "AgentStep",
+        "Round",
+        "Action",
+        "ActionType",
+        "Application",
+    )
+
+    def __init__(self, log_path: str):
+        """
+        Initialize the evaluator.
+        :param log_path: The directory that holds "response.log" and the screenshots.
+        """
+        self.log_path = log_path
+        self.full_trajectory = []
+
+    def load_logs(self):
+        """
+        Load logs from the log path.
+        :return: The list of records parsed from "response.log", one JSON object per non-blank line.
+        """
+        log_file_path = os.path.join(self.log_path, "response.log")
+        with open(log_file_path, "r") as f:
+            # Skip blank lines (e.g. a trailing empty line) instead of failing in json.loads.
+            return [json.loads(line) for line in f if line.strip()]
+
+    def load_images(self):
+        """
+        Load images from the log directory.
+        :return: The encoded first-step screenshot followed by the encoded final screenshot.
+        """
+        image_names = ("action_step1.png", "action_step_final.png")
+        return [
+            PhotographerFacade().encode_image_from_path(
+                os.path.join(self.log_path, image_name)
+            )
+            for image_name in image_names
+        ]
+
+    def get_xml(self):
+        """
+        Get the xml. Not implemented yet.
+        """
+        pass
+
+    def take_final_screenshot(self):
+        """
+        Take the final screenshot. Not implemented yet.
+        """
+        pass
+
+    def get_trajectory(self):
+        """
+        Get the trajectory of the logs.
+        Rebuilds self.full_trajectory from scratch: one text item per log record, followed by one image item per screenshot.
+        :raises KeyError: If a log record misses one of the expected fields.
+        """
+        logs = self.load_logs()
+        images = self.load_images()
+
+        trajectory = []
+        for item in logs:
+            # "Request" is the only field that is renamed in the prompt.
+            step_trajectory = {"User Request": item["Request"]}
+            step_trajectory.update((key, item[key]) for key in self._STEP_FIELDS)
+            trajectory.append({"type": "text", "text": str(step_trajectory)})
+
+        trajectory.extend(
+            {"type": "image_url", "image_url": {"url": image}} for image in images
+        )
+
+        # Replace rather than append, so that calling this twice does not duplicate the content.
+        self.full_trajectory = trajectory
+
+    def __build_prompt(self):
+        """
+        Build the prompt for the evaluation.
+        :return: The messages: the system instruction followed by the user message carrying self.full_trajectory.
+        """
+        system_instruction = """You're an evaluator who can evaluate whether an agent has successfully completed a task. The agent is an AI model that can interact with the desktop application and take actions. 
+You will be provided with a task and the execution trajectory of the agent, including the agent's thought, observation, plan, actions that have been taken, and etc. Besides, you will also be provided with the screenshot before starting the task and after the task is finished. 
+You are required to judge whether the agent has finished the task or not by observing the screenshot differences and the intermediate steps of the agent. The answer should be "yes" or "no" or "unsure". If you are not sure, please select "unsure". 
+Don't make up the answer, otherwise, very bad things will happen.
+You should follow the below JSON format to your reply:
+{
+    "reason": "the reason why you identify the agent's output as correct or incorrect.",
+    "complete": "yes/no/unsure"
+}
+"""
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": system_instruction}],
+            },
+            {"role": "user", "content": self.full_trajectory},
+        ]
+        return messages
+
+    def evaluate(self):
+        """
+        Evaluate the trajectory.
+        :return: A tuple of the parsed JSON response (None if the response cannot be parsed) and the cost returned by get_completions.
+        """
+        self.get_trajectory()
+        messages = self.__build_prompt()
+        response_string, cost = get_completions(messages=messages, agent="appagent")
+        try:
+            response_json = json_parser(response_string[0])
+        except Exception:
+            # Best effort: an unparsable reply yields no verdict instead of aborting the caller.
+            # Unlike a bare except, KeyboardInterrupt and SystemExit still propagate.
+            response_json = None
+        return response_json, cost
+
+# Manual smoke test: evaluate the logs of a sample session.
+if __name__ == "__main__":
+    Evaluator(log_path="./logs/bbb/").evaluate()
+
+
diff --git a/ufo/module/basic.py b/ufo/module/basic.py
@@ -26,6 +26,7 @@
 from ufo.agent.agent import AgentFactory, HostAgent
 from ufo.config.config import Config
 from ufo.experience.summarizer import ExperienceSummarizer
+from ufo.automator.ui_control.screenshot import PhotographerFacade
 from ufo.module.processors.basic import BaseProcessor
 from ufo.module.state import (
     ErrorState,
@@ -34,6 +35,7 @@
     Status,
     StatusToStateMapper,
 )
+from ufo.eval.evaluate import Evaluator
 
 configs = Config.get_instance().config_data
 
@@ -238,14 +240,15 @@ class BaseSession(ABC):
     6. At this point, the session will ask the user if they want to save the experience. If the user wants to save the experience, the session will save the experience and terminate.
     """
 
-    def __init__(self, task: str) -> None:
+    def __init__(self, task: str, evaluate: bool) -> None:
         """
         Initialize a session.
         :param task: The name of current task.
         """
 
         # Task-related properties
         self.task = task
+        self.evaluate = evaluate
         self._step = 0
         self._round = 0
 
@@ -280,6 +283,10 @@ def __init__(self, task: str) -> None:
         self._cost = 0.0
         self.control_reannotate = []
 
+        # Evaluation-related properties
+        if self.evaluate:
+            self.evaluator = Evaluator(self.log_path)
+
     @abstractmethod
     def create_round(self):
         """
@@ -458,6 +465,15 @@ def handle(self) -> None:
         """
         self._state.handle(self)
 
+    def evaluation(self) -> None:
+        """
+        Evaluate the session.
+        Runs the session's evaluator, adds the evaluation cost to the session cost, and reports the result on the console and in the session log.
+        When the session was created with evaluation disabled, no evaluator exists; a notice is printed and nothing else happens.
+        """
+        evaluator = getattr(self, "evaluator", None)
+        if evaluator is None:
+            # The evaluator is only created in __init__ when self.evaluate is set.
+            utils.print_with_color(
+                "Evaluation is not enabled for this session.", "yellow"
+            )
+            return
+
+        result, cost = evaluator.evaluate()
+        self.update_cost(cost)
+        utils.print_with_color(f"Evaluation result: {result}", "magenta")
+        self.logger.info(f"{result}")
+
     @property
     def session_type(self) -> str:
         """
@@ -466,6 +482,12 @@ def session_type(self) -> str:
         """
         return self.__class__.__name__
 
+    def capture_last_screenshot(self) -> None:
+        """
+        Capture a screenshot of the application window and save it as "action_step_final.png" under the session log path.
+        """
+        # Local import keeps this method self-contained.
+        import os
+
+        # Build the path with os.path.join, the same way the evaluator reads the file back,
+        # so both agree whether or not log_path ends with a path separator.
+        screenshot_save_path = os.path.join(self.log_path, "action_step_final.png")
+        PhotographerFacade().capture_app_window_screenshot(
+            self.app_window, save_path=screenshot_save_path
+        )
+
     @staticmethod
     def initialize_logger(log_path: str, log_filename: str) -> logging.Logger:
         """

diff --git a/ufo/module/session.py b/ufo/module/session.py
@@ -104,26 +104,26 @@ class SessionFactory:
     The factory class to create a session.
     """
 
-    def create_session(self, task: str, mode: str, plan: str) -> BaseSession:
+    def create_session(
+        self, task: str, mode: str, plan: str, evaluate: bool = False
+    ) -> List[BaseSession]:
         """
         Create a session.
         :param task: The name of current task.
         :param mode: The mode of the task.
-        :return: The created session.
+        :param plan: The path of the plan file or folder; only used in the follower mode.
+        :param evaluate: Whether the created sessions are evaluated after the execution is finished.
+        :return: The list of created sessions.
+        :raises ValueError: If the mode is neither "normal" nor "follower".
         """
         if mode == "normal":
-            return [Session(task)]
+            return [Session(task, evaluate)]
         elif mode == "follower":
             # If the plan is a folder, create a follower session for each plan file in the folder.
             if self.is_folder(plan):
-                return self.create_follower_session_in_batch(task, plan)
+                return self.create_follower_session_in_batch(task, plan, evaluate)
             else:
-                return [FollowerSession(task, plan)]
+                return [FollowerSession(task, plan, evaluate)]
         else:
             raise ValueError(f"The {mode} mode is not supported.")
 
     def create_follower_session_in_batch(
-        self, task: str, plan: str
+        self, task: str, plan: str, evaluate: bool
     ) -> List[BaseSession]:
         """
         Create a follower session.
@@ -134,7 +134,7 @@ def create_follower_session_in_batch(
         plan_files = self.get_plan_files(plan)
         file_names = [self.get_file_name_without_extension(f) for f in plan_files]
         sessions = [
-            FollowerSession(f"{task}/{file_name}", plan_file)
+            FollowerSession(f"{task}/{file_name}", plan_file, evaluate)
             for file_name, plan_file in zip(file_names, plan_files)
         ]
 
@@ -172,13 +172,13 @@ class Session(BaseSession):
     A session for UFO.
     """
 
-    def __init__(self, task: str):
+    def __init__(self, task: str, evaluate: bool = False):
         """
         Initialize a session.
         :param task: The name of current task.
         """
 
-        super(Session, self).__init__(task)
+        super(Session, self).__init__(task, evaluate)
 
         # Initial setup and welcome message
         utils.print_with_color(interactor.WELCOME_TEXT, "cyan")
@@ -259,14 +259,14 @@ class FollowerSession(Session):
     This session is used for the follower agent, which accepts a plan file to follow using the PlanReader.
     """
 
-    def __init__(self, task: str, plan_file: str) -> None:
+    def __init__(self, task: str, plan_file: str, evaluate: bool) -> None:
         """
         Initialize a session.
         :param task: The name of current task.
         :param plan_dir: The path of the plan file to follow.
         """
 
-        super(Session, self).__init__(task)
+        super(Session, self).__init__(task, evaluate)
 
         self.plan_reader = PlanReader(plan_file)
         self.request = self.plan_reader.get_host_agent_request()

diff --git a/ufo/module/state.py b/ufo/module/state.py
@@ -158,13 +158,18 @@ def handle(self, session: "Session") -> None:
         Handle the session. Finish the entire session, and save the experience if needed.
         :param session: The session.
         """
+        # capture app screenshot after finishing the task 
+        session.capture_last_screenshot()
 
         # Save the experience if needed, only for the normal session.
         if session.session_type == "Session":
             if experience_asker():
                 session.experience_saver()
 
-        session.set_state(NoneState())
+        if session.evaluate:
+            session.set_state(EvaluationState())
+        else:
+            session.set_state(NoneState())
 
 
 class ErrorState(SessionState):
@@ -264,4 +269,8 @@ def handle(self, session: "Session") -> None:
         Handle the session. Process the evaluation.
         :param session: The session.
         """
-        pass
+        print_with_color("Start the evaluation", "yellow")
+        # evaluator = session.evaluator
+        # evaluator.evaluate()
+        session.evaluation()
+        session.set_state(NoneState())
diff --git a/ufo/ufo.py b/ufo/ufo.py
@@ -33,6 +33,13 @@
     type=str,
     default="",
 )
+args.add_argument(
+    "--evaluate",
+    "-e",
+    # BooleanOptionalAction makes this a value-less switch and also registers --no-evaluate,
+    # so the help must not suggest passing a value such as "True".
+    help="The evaluation mode. Pass --evaluate (or -e) to run the evaluation after the execution is finished, or --no-evaluate to skip it.",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+)
 
 parsed_args = args.parse_args()
 
@@ -48,7 +55,7 @@ def main():
     python -m ufo -t task_name -m follower -p path_to_plan_file_or_folder
     """
     sessions = SessionFactory().create_session(
-        task=parsed_args.task, mode=parsed_args.mode, plan=parsed_args.plan
+        task=parsed_args.task, mode=parsed_args.mode, plan=parsed_args.plan, evaluate=parsed_args.evaluate
     )
 
     clients = UFOClientManager(sessions)