Skip to content

Commit

Permalink
troubleshoot notification service by multiple agents
Browse files Browse the repository at this point in the history
  • Loading branch information
shlomsh committed Sep 4, 2023
1 parent becea38 commit 6038aaa
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 23 deletions.
4 changes: 4 additions & 0 deletions data/demo/logs/node-ui-service-k8s-logs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
August 31, 2023 1:27:12 node-01 kubelet[12345]: I0628 1:27:12.456789 node-01 kubelet[12345]: OOMKilling POD uid: "b1234c56-d78e-11e9-8a28-0242ac110002", name: "node-ui-service-pod", namespace: "my-namespace", container: "node-ui-service-container", Memory cgroup out of memory: Kill process 5678 (node-ui-service-container) score 1000 or sacrifice child
August 31, 2023 1:27:12 node-01 kernel: [345678.123456] Memory cgroup out of memory: Kill process 5678 (node-ui-service-container) score 1000 or sacrifice child
August 31, 2023 1:27:12 node-01 kernel: [345678.123457] Killed process 5678 (node-ui-service-container) total-vm:123456kB, anon-rss:12345kB, file-rss:0kB, shmem-rss:0kB
August 31, 2023 1:27:12 node-01 kubelet[12345]: I0628 1:27:12.456799 node-01 kubelet[12345]: pod "node-ui-service-pod_my-namespace(b1234c56-d78e-11e9-8a28-0242ac110002)" failed due to OOM Killer.
4 changes: 0 additions & 4 deletions data/demo/logs/notification-service-k8s-logs.txt

This file was deleted.

142 changes: 135 additions & 7 deletions genia/llm_function/agent_skill_function.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
from collections import deque
import json
import logging
import re

from traitlets import Any
from genia.agents.open_ai import OpenAIToolsEmpoweredAgent
from genia.conversation.llm_conversation import LLMConversation

from genia.llm_function.llm_function import LLMFunction
from genia.llm_function.llm_function_repository import LLMFunctionRepository
from genia.llm_function.llm_skill_function import SkillFunction
from genia.llm_function.open_api_function import OpenApiFunction
from genia.llm_function.python_function import PythonFunction
from genia.llm_function.url_function import URLFunction
from genia.llm_function_lookup_strategy.llm_function_lookup_strategy import LLMFunctionLookupStrategy
from genia.settings_loader import settings
from genia.utils.utils import safe_loads


class AgentSkillFunction(LLMFunction):
Expand All @@ -32,21 +40,141 @@ def __init__(

def evaluate(self, function_config: dict, parameters: dict) -> Any:
    """Run a multi-agent skill: plan it into atomic tasks, then execute them in order.

    :param function_config: tool configuration; must contain "tool_name" and
        "agent" (e.g. agent: SRE).
    :param parameters: unused here; kept for the LLMFunction interface.
    :return: the execution agent's answer for the final task, or None when the
        planner produced no parsable tasks.
    """
    tool_name = function_config["tool_name"]
    # i.e agent: SRE
    agent_name = function_config["agent"]
    self.logger.debug(f"AgentSkillFunction: {agent_name} {tool_name}")

    # Parse planner output of the form "1. do something" into ordered task records.
    tasks_list = deque()
    for new_task in self.planner_agent(tool_name, agent_name):
        task_parts = new_task["task"].strip().split(".", 1)
        if len(task_parts) == 2:
            task_id = task_parts[0].strip()
            task_name = task_parts[1].strip()
            tasks_list.append({"task_id": task_id, "task": task_name})

    # Execute tasks sequentially, accumulating shared context so each step can
    # build on the previous results.
    agents_ctx = []
    # BUGFIX: initialize result so an empty plan returns None instead of
    # raising UnboundLocalError; also dropped the unused `skill` lookup and
    # `special_agent_prompt` leftovers.
    result = None
    while tasks_list:
        new_task = tasks_list.popleft()
        # Send to execution function to complete the task based on the context
        agents_ctx.append({"role": "user", "content": new_task["task"]})
        result = self.execution_agent(agents_ctx, agent_name)
        agents_ctx.append({"role": "assistant", "content": result})

    return result

def execution_agent(self, agents_ctx, agent_name):
    """Ask the model to carry out the latest task in *agents_ctx*.

    The system prompt stack is: the agent-specific prompt (when configured),
    then the generic execution-agent prompt, followed by the shared context.
    Returns the model's textual answer via call_model.
    """
    execution_agent_prompt = settings["execution_agent_prompt"]["system"]
    system_prompts = []
    if settings[agent_name + "_agent_prompt"] is not None:
        system_prompts.append(
            {"role": "system", "content": settings[agent_name + "_agent_prompt"]["system"]}
        )
    system_prompts.append({"role": "system", "content": execution_agent_prompt})

    messages = system_prompts + list(agents_ctx)
    tools = self._function_lookup_strategy.find_potential_tools(self._llm_conversation)
    return self.call_model(messages, tools, "auto")

def call_model(self, messages, functions, mode):
    """Drive a model/function-calling loop until the model stops.

    :param messages: chat messages; mutated in place as function calls and
        their results are appended.
    :param functions: tool schemas offered to the model.
    :param mode: function_call mode forwarded to the model ("auto", "none", ...).
    :return: the model's final text on a "stop" finish reason, or None if the
        configured chain-length limit is exhausted first.
    """
    for _ in range(settings["chat"]["max_function_chain_length"]):
        # BUGFIX: forward the caller-supplied mode; the old code hard-coded
        # "auto", silently ignoring the parameter.
        response = self._agent.call_model(messages, functions, mode)
        choice = response["choices"][0]
        message = choice["message"]
        finish_reason = choice["finish_reason"]

        if finish_reason == "stop":
            return message["content"]

        elif finish_reason == "function_call":
            function_name = message["function_call"]["name"]
            function_arguments = safe_loads(message["function_call"]["arguments"])
            self.logger.debug(
                "the model decided to call the function: %s, with parameters: %s",
                function_name,
                function_arguments,
            )
            try:
                llm_matching_tool = self._function_repository.find_tool_by_name(function_name)
                if llm_matching_tool is None:
                    # BUGFIX: report the requested name; the old message
                    # formatted llm_matching_tool, which is None here.
                    raise ValueError("function {} doesn't exist".format(function_name))
                self.logger.debug("found the tool: %s", llm_matching_tool)
                function_response = self.llm_function_call(
                    messages,
                    function_name,
                    function_arguments,
                    llm_matching_tool,
                )
            except Exception as e:
                # Best-effort: feed the error text back to the model so it can
                # recover instead of aborting the whole chain.
                function_response = str(e)
                self.logger.exception(
                    "Error executing function=%s, parameters=%s, error=%s",
                    function_name,
                    function_arguments,
                    function_response,
                )

            self.logger.debug("function response: %s", function_response)
            messages.append(
                {
                    "role": "function",
                    "name": function_name,
                    "content": str(function_response),
                }
            )

def llm_function_call(self, messages, function_name, function_arguments, llm_matching_tool):
    """Record the assistant's function_call in *messages*, then evaluate the tool.

    Returns the tool's result coerced to str so it can be fed back to the model.
    """
    call_record = {
        "role": "assistant",
        "content": None,
        "function_call": {
            "name": function_name,
            "arguments": json.dumps(function_arguments),
        },
    }
    messages.append(call_record)

    handler = self.create_function(llm_matching_tool.get("category"))
    return str(handler.evaluate(llm_matching_tool, function_arguments))

def create_function(self, category: str) -> LLMFunction:
    """Instantiate the LLMFunction implementation for a tool *category*.

    Category matching is case-insensitive; an unknown category raises ValueError.
    """
    self.logger.debug("create_function with category: %s", category)
    factories = {
        "url": URLFunction,
        "python": PythonFunction,
        "open_api": OpenApiFunction,
        "skill": lambda: SkillFunction(self._function_repository),
    }
    factory = factories.get(category.lower())
    if factory is None:
        raise ValueError("category is not supported:" + category)
    return factory()

def planner_agent(self, tool_name, agent_name):
    """Ask the planner model to decompose a skill into a numbered task list.

    :param tool_name: skill identifier used to fetch the skill text.
    :param agent_name: prefix for the agent-specific system prompt
        (e.g. "SRE" -> settings["SRE_agent_prompt"]).
    :return: list of {"task": "<n>. <description>"} dicts, one per numbered
        line in the model's answer.
    """
    skill = self._function_repository.find_skill_by_name(tool_name)
    planner_agent_prompt = settings["planner_agent_prompt"]["system"]
    if settings[agent_name + "_agent_prompt"] is not None:
        special_agent_prompt = settings[agent_name + "_agent_prompt"]["system"]
    else:
        special_agent_prompt = settings["agent_prompt"]["system"]

    messages = [
        {"role": "system", "content": special_agent_prompt},
        {"role": "system", "content": planner_agent_prompt},
    ]
    # Replay prior user turns (minus bare "yes" confirmations) so the planner
    # sees the conversation context.
    messages.extend(
        {"role": "user", "content": item["content"]}
        for item in self._llm_conversation.get_messages()
        if item["role"] == "user" and item["content"] != "yes"
    )
    # The skill text is the objective to decompose; appended once, last.
    messages.append({"role": "user", "content": skill})

    functions = self._function_lookup_strategy.find_potential_tools(self._llm_conversation)
    # BUGFIX: a stale unconditional `return self._agent.call_model(...)` made
    # everything below unreachable, so the raw API response was returned and
    # the numbered-list parsing never ran. Call once and parse the text.
    model_response = self._agent.call_model(messages, functions, "none")
    model_response_txt = model_response["choices"][0]["message"]["content"]
    self.logger.debug(model_response_txt)

    tasks = model_response_txt.strip().split("\n")
    # Keep only lines that look like "1. ..." — the format requested by the
    # planner prompt.
    return [{"task": task} for task in tasks if re.match(r"^\d+\.", task)]
20 changes: 14 additions & 6 deletions genia/settings/prompts.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,24 @@ if those steps mention a function call you are not familiar with, first call fun

[planner_agent_prompt]
system="""
Act as a super intelligent cloud architect with many years of experience and pragmatic approach.
take it step by step and create a plan based on industry and current company best practices.
the plan should be split into different steps to be executed in order with shared context including calling available functions.
the user must validate the plan before any function is called.
take it step by step and present a plan, validate it before you proceed.
Act as a task planner AI. Your goal, based on a user-given objective, is to create a list of simple atomic steps to be executed by multiple AI assistants separately
Use function calls with function name and parameters.
take it step by step and create a plan, validate it before you proceed.
Return the result as a numbered list, like:
#. First task
#. Second task
"""

[execution_agent_prompt]
system="""
Act as an extremely smart, pragmatic cloud infrastructure engineer with expertise in aws, gcp, python, k8s and similar technologies
Execute available functions yourself, have a proactive approach.
Be concise and provide answers which are short, informative, actionable and precise.
"""

[sre_agent_prompt]
system="""
Act as a extremely smart SRE engineer with expertiese in cloud infrastructure such as aws, gcp, python, k8s etc.
Act as a smart SRE engineer with expertise in cloud infrastructure such as aws, gcp, python, k8s etc.
Be concise and provide answers which are short, precise and action biased.
Execute available functions yourself, have a proactive approach.
Never make assumptions about function parameters values when using function calling.
Expand Down
2 changes: 1 addition & 1 deletion genia/tools_config/skills/skills_tools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@
category: skill
title: revoke production permissions
- tool_name: troubleshoot_notification_service
category: skill
category: agent_skill
agent: SRE
15 changes: 10 additions & 5 deletions genia/tools_config/skills/troubleshoot_notification_service.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
To troubleshoot a service in production, follow these steps:

1. call function 'fetch_grafana_observability_metric_data' serially to fetch all the Grafana service metrics data with the problematic service name and each time with one of the following metric name: cpu, memory, cluster_size and k8s_crash_loopbacks
2. comparing to the entire data set of 30 minutes, carefully look for a sudden increase in each of the metrics data in past few minutes as an anomaly for this metrics data, take it step by step and validate you have the right answer
3. the service consumes from a kafka queue, call function 'fetch_grafana_observability_metric_data' using its key metric 'kafka_lag_size' which means the number of input messages waiting in the queue to be handled by the service
4. call function 'fetch_k8s_service_log_data' with the problematic service name and log name 'notification-service-k8s-logs' and look for k8s errors that might have caused the issue
5. summarize your finding and in an actionalble way with recommendations what to do next
1. call function 'fetch_grafana_observability_metric_data' serially to fetch all the Grafana service metrics data with the problematic service name and each time with one of the following metric name: cpu, memory, cluster_size and k8s_crash_loopbacks, look at each of the data sets in each step separately and detect anomalies by comparing the last 5 minutes to the preceding 25 minutes average and standard deviation, look for z-score greater than 3 or less than -3 for this metrics data, validate you have the right answer.
2. the service consumes from a kafka queue, call function 'fetch_grafana_observability_metric_data' using its key metric 'kafka_lag_size' which means the number of input messages waiting in the queue to be handled by the service
3. call function 'fetch_k8s_service_log_data' with the problematic service name and log name 'node-ui-service-k8s-logs' and look for k8s errors that might have caused the issue
4. print your finding in 3 sections:
Report:
for each data collected print the name and short description of the findings
Insights summary:
short summary and insights of the findings
Recommendations:
suggest the user what should be done next

0 comments on commit 6038aaa

Please sign in to comment.