diff --git a/ufo/config/config.py b/ufo/config/config.py
index ab9f5166..db4654b0 100644
--- a/ufo/config/config.py
+++ b/ufo/config/config.py
@@ -69,7 +69,6 @@ def update_api_base(configs: dict, agent: str) -> None:
         :param configs: The configuration dictionary.
         :param agent: The agent name.
         """
-
         if configs[agent]["API_TYPE"].lower() == "aoai":
             if "deployments" not in configs[agent]["API_BASE"]:
                 configs[agent]["API_BASE"] = (
diff --git a/ufo/eval/__init__.py b/ufo/eval/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/ufo/eval/evaluate.py b/ufo/eval/evaluate.py
new file mode 100644
index 00000000..93cc088b
--- /dev/null
+++ b/ufo/eval/evaluate.py
@@ -0,0 +1,131 @@
+import json
+import os
+
+from ufo.automator.ui_control.screenshot import PhotographerFacade
+from ufo.llm.llm_call import get_completions
+from ufo.utils import json_parser
+
+
+class Evaluator:
+    """
+    Judge whether an agent finished its task, based on the logs and screenshots of a session.
+    """
+
+    # Log record fields copied under the same name into each trajectory step, in output order.
+    STEP_KEYS = (
+        "Step",
+        "Agent",
+        "AgentName",
+        "Observation",
+        "Thought",
+        "ControlLabel",
+        "ControlText",
+        "Status",
+        "Plan",
+        "Comment",
+        "RoundStep",
+        "AgentStep",
+        "Round",
+        "Action",
+        "ActionType",
+        "Application",
+    )
+
+    def __init__(self, log_path: str) -> None:
+        """
+        Initialize the evaluator.
+        :param log_path: The folder that holds response.log and the screenshots.
+        """
+        self.log_path = log_path
+        self.full_trajectory = []
+
+    def load_logs(self) -> list:
+        """
+        Load logs from the log path.
+        :return: The parsed JSON record of each non-blank line in response.log.
+        """
+        log_file_path = os.path.join(self.log_path, "response.log")
+        with open(log_file_path, "r") as f:
+            # Skip blank lines so that a stray empty line does not break json.loads.
+            return [json.loads(line) for line in f if line.strip()]
+
+    def load_images(self) -> list:
+        """
+        Load images from the log directory.
+        :return: The encoded first-step and final screenshots, in that order.
+        """
+        photographer = PhotographerFacade()
+        return [
+            photographer.encode_image_from_path(os.path.join(self.log_path, name))
+            for name in ("action_step1.png", "action_step_final.png")
+        ]
+
+    def get_xml(self):
+        """
+        Get the xml. Not implemented yet.
+        """
+        pass
+
+    def take_final_screenshot(self):
+        """
+        Take the final screenshot. Not implemented yet.
+        """
+        pass
+
+    def get_trajectory(self) -> None:
+        """
+        Get the trajectory of the logs.
+        The trajectory is rebuilt on every call, so repeated calls do not duplicate steps.
+        """
+        logs = self.load_logs()
+        images = self.load_images()
+        self.full_trajectory = []
+        for item in logs:
+            step_trajectory = {"User Request": item["Request"]}
+            step_trajectory.update((key, item[key]) for key in self.STEP_KEYS)
+            self.full_trajectory.append({"type": "text", "text": str(step_trajectory)})
+        self.full_trajectory.extend(
+            {"type": "image_url", "image_url": {"url": image}} for image in images
+        )
+
+    def __build_prompt(self) -> list:
+        """
+        Build the prompt for the evaluation.
+        :return: The system message and the user message that carries the trajectory.
+        """
+        system_instruction = """You're an evaluator who can evaluate whether an agent has successfully completed a task. The agent is an AI model that can interact with the desktop application and take actions.
+You will be provided with a task and the execution trajectory of the agent, including the agent's thought, observation, plan, actions that have been taken, and etc. Besides, you will also be provided with the screenshot before starting the task and after the task is finished.
+You are required to judge whether the agent has finished the task or not by observing the screenshot differences and the intermediate steps of the agent. The answer should be "yes" or "no" or "unsure". If you are not sure, please select "unsure".
+Don't make up the answer, otherwise, very bad things will happen.
+You should follow the below JSON format to your reply:
+{
+    "reason": "the reason why you identify the agent's output as correct or incorrect.",
+    "complete": "yes/no/unsure"
+}
+"""
+        return [
+            {"role": "system", "content": [{"type": "text", "text": system_instruction}]},
+            {"role": "user", "content": self.full_trajectory},
+        ]
+
+    def evaluate(self) -> tuple:
+        """
+        Evaluate the trajectory.
+        :return: The parsed evaluation (None if the reply cannot be parsed) and the cost of the call.
+        """
+        self.get_trajectory()
+        messages = self.__build_prompt()
+        response_string, cost = get_completions(messages=messages, agent="appagent")
+        try:
+            response_json = json_parser(response_string[0])
+        except Exception:
+            # Best effort: an unparsable reply yields no result instead of aborting the session.
+            # Unlike a bare except, this lets KeyboardInterrupt and SystemExit propagate.
+            response_json = None
+        return response_json, cost
+
+
+# For test
+if __name__ == "__main__":
+    evaluator = Evaluator(log_path="./logs/bbb/")
+    evaluator.evaluate()
diff --git a/ufo/module/basic.py b/ufo/module/basic.py
index 42d5654d..d58d17cb 100644
--- a/ufo/module/basic.py
+++ b/ufo/module/basic.py
@@ -26,6 +26,7 @@
 from ufo.agent.agent import AgentFactory, HostAgent
 from ufo.config.config import Config
 from ufo.experience.summarizer import ExperienceSummarizer
+from ufo.automator.ui_control.screenshot import PhotographerFacade
 from ufo.module.processors.basic import BaseProcessor
 from ufo.module.state import (
     ErrorState,
@@ -34,6 +35,7 @@
     Status,
     StatusToStateMapper,
 )
+from ufo.eval.evaluate import Evaluator
 
 configs = Config.get_instance().config_data
 
@@ -238,7 +240,7 @@ class BaseSession(ABC):
     6. At this point, the session will ask the user if they want to save the experience. If the user wants to save the experience, the session will save the experience and terminate.
     """
 
-    def __init__(self, task: str) -> None:
+    def __init__(self, task: str, evaluate: bool = False) -> None:
         """
         Initialize a session.
         :param task: The name of current task.
@@ -246,6 +248,7 @@ def __init__(self, task: str) -> None:
 
         # Task-related properties
         self.task = task
+        self.evaluate = evaluate
         self._step = 0
         self._round = 0
 
@@ -280,6 +283,10 @@ def __init__(self, task: str) -> None:
         self._cost = 0.0
         self.control_reannotate = []
 
+        # Evaluation-related properties
+        if self.evaluate:
+            self.evaluator = Evaluator(self.log_path)
+
     @abstractmethod
     def create_round(self):
         """
@@ -458,6 +465,15 @@ def handle(self) -> None:
         """
         self._state.handle(self)
 
+    def evaluation(self) -> None:
+        """
+        Evaluate the session.
+        """
+        result, cost = self.evaluator.evaluate()
+        self.update_cost(cost)
+        utils.print_with_color(f"Evaluation result: {result}", "magenta")
+        self.logger.info(f"{result}")
+
     @property
     def session_type(self) -> str:
         """
@@ -466,6 +482,15 @@ def session_type(self) -> str:
         """
         return self.__class__.__name__
 
+    def capture_last_screenshot(self) -> None:
+        """
+        Capture the application window and save it as self.log_path + "action_step_final.png".
+        """
+        screenshot_save_path = self.log_path + "action_step_final.png"
+        PhotographerFacade().capture_app_window_screenshot(
+            self.app_window, save_path=screenshot_save_path
+        )
+
     @staticmethod
     def initialize_logger(log_path: str, log_filename: str) -> logging.Logger:
         """
diff --git a/ufo/module/session.py b/ufo/module/session.py
index a8c4dfbf..296bba2b 100644
--- a/ufo/module/session.py
+++ b/ufo/module/session.py
@@ -104,7 +104,7 @@ class SessionFactory:
     The factory class to create a session.
     """
 
-    def create_session(self, task: str, mode: str, plan: str) -> BaseSession:
+    def create_session(self, task: str, mode: str, plan: str, evaluate: bool = False) -> List[BaseSession]:
         """
         Create a session.
         :param task: The name of current task.
@@ -112,18 +112,18 @@ def create_session(self, task: str, mode: str, plan: str) -> BaseSession:
         :return: The created session.
         """
         if mode == "normal":
-            return [Session(task)]
+            return [Session(task, evaluate)]
         elif mode == "follower":
             # If the plan is a folder, create a follower session for each plan file in the folder.
             if self.is_folder(plan):
-                return self.create_follower_session_in_batch(task, plan)
+                return self.create_follower_session_in_batch(task, plan, evaluate)
             else:
-                return [FollowerSession(task, plan)]
+                return [FollowerSession(task, plan, evaluate)]
         else:
             raise ValueError(f"The {mode} mode is not supported.")
 
     def create_follower_session_in_batch(
-        self, task: str, plan: str
+        self, task: str, plan: str, evaluate: bool = False
     ) -> List[BaseSession]:
         """
         Create a follower session.
@@ -134,7 +134,7 @@ def create_follower_session_in_batch(
         plan_files = self.get_plan_files(plan)
         file_names = [self.get_file_name_without_extension(f) for f in plan_files]
         sessions = [
-            FollowerSession(f"{task}/{file_name}", plan_file)
+            FollowerSession(f"{task}/{file_name}", plan_file, evaluate)
             for file_name, plan_file in zip(file_names, plan_files)
         ]
 
@@ -172,13 +172,13 @@ class Session(BaseSession):
     A session for UFO.
     """
 
-    def __init__(self, task: str):
+    def __init__(self, task: str, evaluate: bool = False):
         """
         Initialize a session.
         :param task: The name of current task.
         """
 
-        super(Session, self).__init__(task)
+        super(Session, self).__init__(task, evaluate)
 
         # Initial setup and welcome message
         utils.print_with_color(interactor.WELCOME_TEXT, "cyan")
@@ -259,14 +259,14 @@ class FollowerSession(Session):
     This session is used for the follower agent, which accepts a plan file to follow using the PlanReader.
     """
 
-    def __init__(self, task: str, plan_file: str) -> None:
+    def __init__(self, task: str, plan_file: str, evaluate: bool = False) -> None:
         """
         Initialize a session.
         :param task: The name of current task.
         :param plan_dir: The path of the plan file to follow.
         """
 
-        super(Session, self).__init__(task)
+        super(Session, self).__init__(task, evaluate)
 
         self.plan_reader = PlanReader(plan_file)
         self.request = self.plan_reader.get_host_agent_request()
diff --git a/ufo/module/state.py b/ufo/module/state.py
index 9e50e4c9..53498f95 100644
--- a/ufo/module/state.py
+++ b/ufo/module/state.py
@@ -158,13 +158,18 @@ def handle(self, session: "Session") -> None:
         Handle the session. Finish the entire session, and save the experience if needed.
         :param session: The session.
         """
+        # capture app screenshot after finishing the task
+        session.capture_last_screenshot()
 
         # Save the experience if needed, only for the normal session.
         if session.session_type == "Session":
             if experience_asker():
                 session.experience_saver()
 
-        session.set_state(NoneState())
+        if session.evaluate:
+            session.set_state(EvaluationState())
+        else:
+            session.set_state(NoneState())
 
 
 class ErrorState(SessionState):
@@ -264,4 +269,6 @@ def handle(self, session: "Session") -> None:
         Handle the session. Process the evaluation.
         :param session: The session.
         """
-        pass
+        print_with_color("Start the evaluation", "yellow")
+        session.evaluation()
+        session.set_state(NoneState())
diff --git a/ufo/ufo.py b/ufo/ufo.py
index d77a0f7f..910613c6 100644
--- a/ufo/ufo.py
+++ b/ufo/ufo.py
@@ -33,6 +33,13 @@
     type=str,
     default="",
 )
+args.add_argument(
+    "--evaluate",
+    "-e",
+    help="The evaluation mode. If set to True, the system will run the evaluation after the execution is finished.",
+    default=False,
+    action=argparse.BooleanOptionalAction
+)
 
 parsed_args = args.parse_args()
 
@@ -48,7 +55,7 @@ def main():
     python -m ufo -t task_name -m follower -p path_to_plan_file_or_folder
     """
     sessions = SessionFactory().create_session(
-        task=parsed_args.task, mode=parsed_args.mode, plan=parsed_args.plan
+        task=parsed_args.task, mode=parsed_args.mode, plan=parsed_args.plan, evaluate=parsed_args.evaluate
     )
 
     clients = UFOClientManager(sessions)