Skip to content

Commit

Permalink
troubleshoot notification service by multiple agents
Browse files Browse the repository at this point in the history
  • Loading branch information
shlomsh committed Sep 4, 2023
1 parent becea38 commit 6038aaa
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 23 deletions.
4 changes: 4 additions & 0 deletions data/demo/logs/node-ui-service-k8s-logs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
August 31, 2023 1:27:12 node-01 kubelet[12345]: I0628 1:27:12.456789 node-01 kubelet[12345]: OOMKilling POD uid: "b1234c56-d78e-11e9-8a28-0242ac110002", name: "node-ui-service-pod", namespace: "my-namespace", container: "node-ui-service-container", Memory cgroup out of memory: Kill process 5678 (node-ui-service-container) score 1000 or sacrifice child
August 31, 2023 1:27:12 node-01 kernel: [345678.123456] Memory cgroup out of memory: Kill process 5678 (node-ui-service-container) score 1000 or sacrifice child
August 31, 2023 1:27:12 node-01 kernel: [345678.123457] Killed process 5678 (node-ui-service-container) total-vm:123456kB, anon-rss:12345kB, file-rss:0kB, shmem-rss:0kB
August 31, 2023 1:27:12 node-01 kubelet[12345]: I0628 1:27:12.456799 node-01 kubelet[12345]: pod "node-ui-service-pod_my-namespace(b1234c56-d78e-11e9-8a28-0242ac110002)" failed due to OOM Killer.
4 changes: 0 additions & 4 deletions data/demo/logs/notification-service-k8s-logs.txt

This file was deleted.

142 changes: 135 additions & 7 deletions genia/llm_function/agent_skill_function.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
from collections import deque
import json
import logging
import re

from traitlets import Any
from genia.agents.open_ai import OpenAIToolsEmpoweredAgent
from genia.conversation.llm_conversation import LLMConversation

from genia.llm_function.llm_function import LLMFunction
from genia.llm_function.llm_function_repository import LLMFunctionRepository
from genia.llm_function.llm_skill_function import SkillFunction
from genia.llm_function.open_api_function import OpenApiFunction
from genia.llm_function.python_function import PythonFunction
from genia.llm_function.url_function import URLFunction
from genia.llm_function_lookup_strategy.llm_function_lookup_strategy import LLMFunctionLookupStrategy
from genia.settings_loader import settings
from genia.utils.utils import safe_loads


class AgentSkillFunction(LLMFunction):
Expand All @@ -32,21 +40,141 @@ def __init__(

def evaluate(self, function_config: dict, parameters: dict) -> Any:
    """Run a multi-agent skill: plan it into atomic tasks, then execute them in order.

    :param function_config: tool configuration; must contain "tool_name" and
        "agent" (e.g. agent: SRE).
    :param parameters: unused here; kept for the LLMFunction interface.
    :return: the execution agent's answer for the final task, or None when the
        planner produced no parsable tasks.
    """
    tool_name = function_config["tool_name"]
    # i.e agent: SRE
    agent_name = function_config["agent"]
    self.logger.debug(f"AgentSkillFunction: {agent_name} {tool_name}")

    # Parse planner output of the form "1. do something" into ordered task records.
    tasks_list = deque()
    for new_task in self.planner_agent(tool_name, agent_name):
        task_parts = new_task["task"].strip().split(".", 1)
        if len(task_parts) == 2:
            task_id = task_parts[0].strip()
            task_name = task_parts[1].strip()
            tasks_list.append({"task_id": task_id, "task": task_name})

    # Execute tasks sequentially, accumulating shared context so each step can
    # build on the previous results.
    agents_ctx = []
    # BUGFIX: initialize result so an empty plan returns None instead of
    # raising UnboundLocalError; also dropped the unused `skill` lookup and
    # `special_agent_prompt` leftovers.
    result = None
    while tasks_list:
        new_task = tasks_list.popleft()
        # Send to execution function to complete the task based on the context
        agents_ctx.append({"role": "user", "content": new_task["task"]})
        result = self.execution_agent(agents_ctx, agent_name)
        agents_ctx.append({"role": "assistant", "content": result})

    return result

def execution_agent(self, agents_ctx, agent_name):
    """Ask the model to carry out the latest task in *agents_ctx*.

    The system prompt stack is: the agent-specific prompt (when configured),
    then the generic execution-agent prompt, followed by the shared context.
    Returns the model's textual answer via call_model.
    """
    execution_agent_prompt = settings["execution_agent_prompt"]["system"]
    system_prompts = []
    if settings[agent_name + "_agent_prompt"] is not None:
        system_prompts.append(
            {"role": "system", "content": settings[agent_name + "_agent_prompt"]["system"]}
        )
    system_prompts.append({"role": "system", "content": execution_agent_prompt})

    messages = system_prompts + list(agents_ctx)
    tools = self._function_lookup_strategy.find_potential_tools(self._llm_conversation)
    return self.call_model(messages, tools, "auto")

def call_model(self, messages, functions, mode):
    """Drive a model/function-calling loop until the model stops.

    :param messages: chat messages; mutated in place as function calls and
        their results are appended.
    :param functions: tool schemas offered to the model.
    :param mode: function_call mode forwarded to the model ("auto", "none", ...).
    :return: the model's final text on a "stop" finish reason, or None if the
        configured chain-length limit is exhausted first.
    """
    for _ in range(settings["chat"]["max_function_chain_length"]):
        # BUGFIX: forward the caller-supplied mode; the old code hard-coded
        # "auto", silently ignoring the parameter.
        response = self._agent.call_model(messages, functions, mode)
        choice = response["choices"][0]
        message = choice["message"]
        finish_reason = choice["finish_reason"]

        if finish_reason == "stop":
            return message["content"]

        elif finish_reason == "function_call":
            function_name = message["function_call"]["name"]
            function_arguments = safe_loads(message["function_call"]["arguments"])
            self.logger.debug(
                "the model decided to call the function: %s, with parameters: %s",
                function_name,
                function_arguments,
            )
            try:
                llm_matching_tool = self._function_repository.find_tool_by_name(function_name)
                if llm_matching_tool is None:
                    # BUGFIX: report the requested name; the old message
                    # formatted llm_matching_tool, which is None here.
                    raise ValueError("function {} doesn't exist".format(function_name))
                self.logger.debug("found the tool: %s", llm_matching_tool)
                function_response = self.llm_function_call(
                    messages,
                    function_name,
                    function_arguments,
                    llm_matching_tool,
                )
            except Exception as e:
                # Best-effort: feed the error text back to the model so it can
                # recover instead of aborting the whole chain.
                function_response = str(e)
                self.logger.exception(
                    "Error executing function=%s, parameters=%s, error=%s",
                    function_name,
                    function_arguments,
                    function_response,
                )

            self.logger.debug("function response: %s", function_response)
            messages.append(
                {
                    "role": "function",
                    "name": function_name,
                    "content": str(function_response),
                }
            )

def llm_function_call(self, messages, function_name, function_arguments, llm_matching_tool):
    """Record the assistant's function_call in *messages*, then evaluate the tool.

    Returns the tool's result coerced to str so it can be fed back to the model.
    """
    call_record = {
        "role": "assistant",
        "content": None,
        "function_call": {
            "name": function_name,
            "arguments": json.dumps(function_arguments),
        },
    }
    messages.append(call_record)

    handler = self.create_function(llm_matching_tool.get("category"))
    return str(handler.evaluate(llm_matching_tool, function_arguments))

def create_function(self, category: str) -> LLMFunction:
    """Instantiate the LLMFunction implementation for a tool *category*.

    Category matching is case-insensitive; an unknown category raises ValueError.
    """
    self.logger.debug("create_function with category: %s", category)
    factories = {
        "url": URLFunction,
        "python": PythonFunction,
        "open_api": OpenApiFunction,
        "skill": lambda: SkillFunction(self._function_repository),
    }
    factory = factories.get(category.lower())
    if factory is None:
        raise ValueError("category is not supported:" + category)
    return factory()

def planner_agent(self, tool_name, agent_name):
    """Ask the planner model to decompose a skill into a numbered task list.

    :param tool_name: skill identifier used to fetch the skill text.
    :param agent_name: prefix for the agent-specific system prompt
        (e.g. "SRE" -> settings["SRE_agent_prompt"]).
    :return: list of {"task": "<n>. <description>"} dicts, one per numbered
        line in the model's answer.
    """
    skill = self._function_repository.find_skill_by_name(tool_name)
    planner_agent_prompt = settings["planner_agent_prompt"]["system"]
    if settings[agent_name + "_agent_prompt"] is not None:
        special_agent_prompt = settings[agent_name + "_agent_prompt"]["system"]
    else:
        special_agent_prompt = settings["agent_prompt"]["system"]

    messages = [
        {"role": "system", "content": special_agent_prompt},
        {"role": "system", "content": planner_agent_prompt},
    ]
    # Replay prior user turns (minus bare "yes" confirmations) so the planner
    # sees the conversation context.
    messages.extend(
        {"role": "user", "content": item["content"]}
        for item in self._llm_conversation.get_messages()
        if item["role"] == "user" and item["content"] != "yes"
    )
    # The skill text is the objective to decompose; appended once, last.
    messages.append({"role": "user", "content": skill})

    functions = self._function_lookup_strategy.find_potential_tools(self._llm_conversation)
    # BUGFIX: a stale unconditional `return self._agent.call_model(...)` made
    # everything below unreachable, so the raw API response was returned and
    # the numbered-list parsing never ran. Call once and parse the text.
    model_response = self._agent.call_model(messages, functions, "none")
    model_response_txt = model_response["choices"][0]["message"]["content"]
    self.logger.debug(model_response_txt)

    tasks = model_response_txt.strip().split("\n")
    # Keep only lines that look like "1. ..." — the format requested by the
    # planner prompt.
    return [{"task": task} for task in tasks if re.match(r"^\d+\.", task)]
20 changes: 14 additions & 6 deletions genia/settings/prompts.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,24 @@ if those steps mention a function call you are not familiar with, first call fun

[planner_agent_prompt]
system="""
Act as a super intelligent cloud architect with many years of experience and pragmatic approach.
take it step by step and create a plan based on industry and current company best practices.
the plan should be split into different steps to be executed in order with shared context including calling available functions.
the user must validate the plan before any function is called.
take it step by step and present a plan, validate it before you proceed.
Act as a task planner AI. Your goal, based on a user-given objective, is to create a list of simple atomic steps to be executed by multiple AI assistants separately
Use function calls with function name and parameters.
take it step by step and create a plan, validate it before you proceed.
Return the result as a numbered list, like:
#. First task
#. Second task
"""

[execution_agent_prompt]
system="""
Act as an extremely smart, pragmatic cloud infrastructure engineer with expertise in aws, gcp, python, k8s and similar technologies
Execute available functions yourself, have a proactive approach.
Be concise and provide answers which are short, informative, actionable and precise.
"""

[sre_agent_prompt]
system="""
Act as a extremely smart SRE engineer with expertiese in cloud infrastructure such as aws, gcp, python, k8s etc.
Act as a smart SRE engineer with expertise in cloud infrastructure such as aws, gcp, python, k8s etc.
Be concise and provide answers which are short, precise and action biased.
Execute available functions yourself, have a proactive approach.
Never make assumptions about function parameters values when using function calling.
Expand Down
2 changes: 1 addition & 1 deletion genia/tools_config/skills/skills_tools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@
category: skill
title: revoke production permissions
- tool_name: troubleshoot_notification_service
category: skill
category: agent_skill
agent: SRE
15 changes: 10 additions & 5 deletions genia/tools_config/skills/troubleshoot_notification_service.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
To troubleshoot a service in production, follow these steps:

1. call function 'fetch_grafana_observability_metric_data' serially to fetch all the Grafana service metrics data with the problematic service name and each time with one of the following metric name: cpu, memory, cluster_size and k8s_crash_loopbacks
2. comparing to the entire data set of 30 minutes, carefully look for a sudden increase in each of the metrics data in past few minutes as an anomaly for this metrics data, take it step by step and validate you have the right answer
3. the service consumes from a kafka queue, call function 'fetch_grafana_observability_metric_data' using its key metric 'kafka_lag_size' which means the number of input messages waiting in the queue to be handled by the service
4. call function 'fetch_k8s_service_log_data' with the problematic service name and log name 'notification-service-k8s-logs' and look for k8s errors that might have caused the issue
5. summarize your finding and in an actionalble way with recommendations what to do next
1. call function 'fetch_grafana_observability_metric_data' serially to fetch all the Grafana service metrics data with the problematic service name and each time with one of the following metric name: cpu, memory, cluster_size and k8s_crash_loopbacks, look at each of the data sets in each step separately and detect anomalies by comparing the last 5 minutes to the preceding 25 minutes average and standard deviation, look for z-score greater than 3 or less than -3 for this metrics data, validate you have the right answer.
2. the service consumes from a kafka queue, call function 'fetch_grafana_observability_metric_data' using its key metric 'kafka_lag_size' which means the number of input messages waiting in the queue to be handled by the service
3. call function 'fetch_k8s_service_log_data' with the problematic service name and log name 'node-ui-service-k8s-logs' and look for k8s errors that might have caused the issue
4. print your finding in 3 sections:
Report:
for each data collected print the name and short description of the findings
Insights summary:
short summary and insights of the findings
Recommendations:
suggest the user what should be done next

0 comments on commit 6038aaa

Please sign in to comment.