Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add evaluation #74

Merged
merged 1 commit into from
May 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ufo/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ def update_api_base(configs: dict, agent: str) -> None:
:param configs: The configuration dictionary.
:param agent: The agent name.
"""

if configs[agent]["API_TYPE"].lower() == "aoai":
if "deployments" not in configs[agent]["API_BASE"]:
configs[agent]["API_BASE"] = (
Expand Down
Empty file added ufo/eval/__init__.py
Empty file.
127 changes: 127 additions & 0 deletions ufo/eval/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import json
import os
from ufo.llm.llm_call import get_completions
from ufo.automator.ui_control.screenshot import PhotographerFacade
from ufo.utils import json_parser

class Evaluator:
    """
    Evaluate whether an agent completed its task, based on the logs of one session.

    The evaluator reads the step records in ``response.log`` and the first/final
    screenshots stored under ``log_path``, assembles them into a chat prompt and
    asks the LLM for a "yes" / "no" / "unsure" verdict.
    """

    # Keys copied verbatim from every log record into the trajectory text.
    # The order is significant: it fixes the textual layout sent to the model.
    _TRAJECTORY_FIELDS = (
        "Step",
        "Agent",
        "AgentName",
        "Observation",
        "Thought",
        "ControlLabel",
        "ControlText",
        "Status",
        "Plan",
        "Comment",
        "RoundStep",
        "AgentStep",
        "Round",
        "Action",
        "ActionType",
        "Application",
    )

    def __init__(self, log_path: str):
        """
        Initialize the evaluator.
        :param log_path: The directory holding ``response.log`` and the screenshots.
        """
        self.log_path = log_path
        self.full_trajectory = []

    def load_logs(self):
        """
        Load logs from the log path.
        :return: A list with one parsed JSON object per non-empty line of ``response.log``.
        """
        log_file_path = os.path.join(self.log_path, "response.log")
        # Decode explicitly as UTF-8 so parsing does not depend on the locale encoding.
        with open(log_file_path, "r", encoding="utf-8") as f:
            # Blank lines (e.g. a trailing newline pair) are not valid JSON; skip them.
            return [json.loads(line) for line in f if line.strip()]

    def load_images(self):
        """
        Load images from the log directory.
        :return: The encoded first-step screenshot followed by the encoded final screenshot.
        """
        image_names = ("action_step1.png", "action_step_final.png")
        return [
            PhotographerFacade().encode_image_from_path(
                os.path.join(self.log_path, image_name)
            )
            for image_name in image_names
        ]

    def get_xml(self):
        """
        Get the xml.
        Placeholder: not implemented yet.
        """
        pass

    def take_final_screenshot(self):
        """
        Take the final screenshot.
        Placeholder: not implemented yet.
        """
        pass

    def get_trajectory(self):
        """
        Get the trajectory of the logs.

        Rebuilds ``self.full_trajectory`` as one text entry per logged step followed
        by one image entry per screenshot. The list is rebuilt from scratch on every
        call so that evaluating twice does not duplicate the prompt content.
        :raises KeyError: If a log record lacks one of the expected fields.
        """
        logs = self.load_logs()
        images = self.load_images()

        trajectory = []
        for item in logs:
            # The request is the only field that is relabelled in the prompt.
            step_trajectory = {"User Request": item["Request"]}
            step_trajectory.update(
                (field, item[field]) for field in self._TRAJECTORY_FIELDS
            )
            trajectory.append({"type": "text", "text": str(step_trajectory)})

        trajectory.extend(
            {"type": "image_url", "image_url": {"url": image}} for image in images
        )
        # Publish only after everything loaded, so a failure keeps the previous state.
        self.full_trajectory = trajectory

    def __build_prompt(self):
        """
        Build the prompt for the evaluation.
        :return: The chat messages: the system instruction, then the trajectory as the user turn.
        """
        system_instruction = """You're an evaluator who can evaluate whether an agent has successfully completed a task. The agent is an AI model that can interact with the desktop application and take actions.
You will be provided with a task and the execution trajectory of the agent, including the agent's thought, observation, plan, actions that have been taken, and etc. Besides, you will also be provided with the screenshot before starting the task and after the task is finished.
You are required to judge whether the agent has finished the task or not by observing the screenshot differences and the intermediate steps of the agent. The answer should be "yes" or "no" or "unsure". If you are not sure, please select "unsure".
Don't make up the answer, otherwise, very bad things will happen.
You should follow the below JSON format to your reply:
{
"reason": "the reason why you identify the agent's output as correct or incorrect.",
"complete": "yes/no/unsure"
}
"""
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_instruction}],
            },
            {"role": "user", "content": self.full_trajectory},
        ]
        return messages

    def evaluate(self):
        """
        Evaluate the trajectory.
        :return: A tuple of (parsed JSON verdict, or None if the reply could not be parsed; cost returned by the LLM call).
        """
        self.get_trajectory()
        messages = self.__build_prompt()
        response_string, cost = get_completions(messages=messages, agent="appagent")
        try:
            response_json = json_parser(response_string[0])
        except Exception:
            # Best effort: an empty or malformed reply yields no verdict instead of
            # aborting the session. Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            response_json = None
        return response_json, cost

# Manual smoke test: run the evaluator against a previously recorded session.
if __name__ == "__main__":
    Evaluator(log_path="./logs/bbb/").evaluate()


24 changes: 23 additions & 1 deletion ufo/module/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from ufo.agent.agent import AgentFactory, HostAgent
from ufo.config.config import Config
from ufo.experience.summarizer import ExperienceSummarizer
from ufo.automator.ui_control.screenshot import PhotographerFacade
from ufo.module.processors.basic import BaseProcessor
from ufo.module.state import (
ErrorState,
Expand All @@ -34,6 +35,7 @@
Status,
StatusToStateMapper,
)
from ufo.eval.evaluate import Evaluator

configs = Config.get_instance().config_data

Expand Down Expand Up @@ -238,14 +240,15 @@ class BaseSession(ABC):
6. At this point, the session will ask the user if they want to save the experience. If the user wants to save the experience, the session will save the experience and terminate.
"""

def __init__(self, task: str) -> None:
def __init__(self, task: str, evaluate: bool) -> None:
"""
Initialize a session.
:param task: The name of current task.
"""

# Task-related properties
self.task = task
self.evaluate = evaluate
self._step = 0
self._round = 0

Expand Down Expand Up @@ -280,6 +283,10 @@ def __init__(self, task: str) -> None:
self._cost = 0.0
self.control_reannotate = []

# Evaluation-related properties
if self.evaluate:
self.evaluator = Evaluator(self.log_path)

@abstractmethod
def create_round(self):
"""
Expand Down Expand Up @@ -458,6 +465,15 @@ def handle(self) -> None:
"""
self._state.handle(self)

def evaluation(self) -> None:
    """
    Run the evaluator on this session and report its verdict.

    The cost returned by the evaluator is passed to ``update_cost``; the result
    is printed to the console and written to the session logger.
    """
    result, evaluation_cost = self.evaluator.evaluate()
    self.update_cost(evaluation_cost)

    # Show the verdict to the user, then write it to the session log as well.
    utils.print_with_color(f"Evaluation result: {result}", "magenta")
    self.logger.info(f"{result}")

@property
def session_type(self) -> str:
"""
Expand All @@ -466,6 +482,12 @@ def session_type(self) -> str:
"""
return self.__class__.__name__

def capture_last_screenshot(self) -> None:
    """
    Capture the application window once the task has finished.

    The image is saved as ``action_step_final.png`` inside the session log
    directory, which is the file name the evaluator loads as the final screenshot.
    """
    # Local import keeps this method self-contained regardless of module imports.
    import os

    # os.path.join yields the same path when log_path ends with a separator and
    # a correct one when it does not (plain "+" would glue the names together).
    screenshot_save_path = os.path.join(self.log_path, "action_step_final.png")
    PhotographerFacade().capture_app_window_screenshot(
        self.app_window, save_path=screenshot_save_path
    )

@staticmethod
def initialize_logger(log_path: str, log_filename: str) -> logging.Logger:
"""
Expand Down
20 changes: 10 additions & 10 deletions ufo/module/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,26 +104,26 @@ class SessionFactory:
The factory class to create a session.
"""

def create_session(self, task: str, mode: str, plan: str) -> BaseSession:
def create_session(self, task: str, mode: str, plan: str, evaluate: bool) -> BaseSession:
"""
Create a session.
:param task: The name of current task.
:param mode: The mode of the task.
:return: The created session.
"""
if mode == "normal":
return [Session(task)]
return [Session(task, evaluate)]
elif mode == "follower":
# If the plan is a folder, create a follower session for each plan file in the folder.
if self.is_folder(plan):
return self.create_follower_session_in_batch(task, plan)
return self.create_follower_session_in_batch(task, plan, evaluate)
else:
return [FollowerSession(task, plan)]
return [FollowerSession(task, plan, evaluate)]
else:
raise ValueError(f"The {mode} mode is not supported.")

def create_follower_session_in_batch(
self, task: str, plan: str
self, task: str, plan: str, evaluate: bool
) -> List[BaseSession]:
"""
Create a follower session.
Expand All @@ -134,7 +134,7 @@ def create_follower_session_in_batch(
plan_files = self.get_plan_files(plan)
file_names = [self.get_file_name_without_extension(f) for f in plan_files]
sessions = [
FollowerSession(f"{task}/{file_name}", plan_file)
FollowerSession(f"{task}/{file_name}", plan_file, evaluate)
for file_name, plan_file in zip(file_names, plan_files)
]

Expand Down Expand Up @@ -172,13 +172,13 @@ class Session(BaseSession):
A session for UFO.
"""

def __init__(self, task: str):
def __init__(self, task: str, evaluate: bool = False):
"""
Initialize a session.
:param task: The name of current task.
"""

super(Session, self).__init__(task)
super(Session, self).__init__(task, evaluate)

# Initial setup and welcome message
utils.print_with_color(interactor.WELCOME_TEXT, "cyan")
Expand Down Expand Up @@ -259,14 +259,14 @@ class FollowerSession(Session):
This session is used for the follower agent, which accepts a plan file to follow using the PlanReader.
"""

def __init__(self, task: str, plan_file: str) -> None:
def __init__(self, task: str, plan_file: str, evaluate: bool) -> None:
"""
Initialize a session.
:param task: The name of current task.
:param plan_file: The path of the plan file to follow.
"""

super(Session, self).__init__(task)
super(Session, self).__init__(task, evaluate)

self.plan_reader = PlanReader(plan_file)
self.request = self.plan_reader.get_host_agent_request()
Expand Down
13 changes: 11 additions & 2 deletions ufo/module/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,13 +158,18 @@ def handle(self, session: "Session") -> None:
Handle the session. Finish the entire session, and save the experience if needed.
:param session: The session.
"""
# capture app screenshot after finishing the task
session.capture_last_screenshot()

# Save the experience if needed, only for the normal session.
if session.session_type == "Session":
if experience_asker():
session.experience_saver()

session.set_state(NoneState())
if session.evaluate:
session.set_state(EvaluationState())
else:
session.set_state(NoneState())


class ErrorState(SessionState):
Expand Down Expand Up @@ -264,4 +269,8 @@ def handle(self, session: "Session") -> None:
Handle the session. Process the evaluation.
:param session: The session.
"""
pass
print_with_color("Start the evaluation", "yellow")
# evaluator = session.evaluator
# evaluator.evaluate()
session.evaluation()
session.set_state(NoneState())
9 changes: 8 additions & 1 deletion ufo/ufo.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@
type=str,
default="",
)
args.add_argument(
"--evaluate",
"-e",
help="The evaluation mode. If set to True, the system will run the evaluation after the execution is finished.",
default=False,
action=argparse.BooleanOptionalAction
)

parsed_args = args.parse_args()

Expand All @@ -48,7 +55,7 @@ def main():
python -m ufo -t task_name -m follower -p path_to_plan_file_or_folder
"""
sessions = SessionFactory().create_session(
task=parsed_args.task, mode=parsed_args.mode, plan=parsed_args.plan
task=parsed_args.task, mode=parsed_args.mode, plan=parsed_args.plan, evaluate=parsed_args.evaluate
)

clients = UFOClientManager(sessions)
Expand Down