Improve error handling, display and configurability of eval script

- Default to evaluation decision of None when either agent or evaluator llm fails. This fixes accuracy calculations on errors - Fix showing color for decision True - Enable arg flags to specify output results file paths
khoj-ai · Nov 13, 2024 · f4e3720 · f4e3720
1 parent 15b0cfa
commit f4e3720
Showing 1 changed file with 64 additions and 34 deletions.
diff --git a/tests/eval_frames.py b/tests/eval_frames.py
@@ -1,18 +1,20 @@
+import argparse
 import concurrent.futures
 import json
 import logging
 import os
 import time
+from datetime import datetime
 from typing import Any, Dict
 
 import pandas as pd
 import requests
 from datasets import load_dataset
 
-from khoj.utils.helpers import timer
+from khoj.utils.helpers import is_none_or_empty, timer
 
 # Configure root logger
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
+logging.basicConfig(level=logging.INFO, format="%(message)s")
 logger = logging.getLogger(__name__)
 
 # Configuration
@@ -82,23 +84,28 @@ def evaluate_response(query: str, agent_response: str, ground_truth: str) -> Dic
     try:
         response = requests.post(
             GEMINI_API_URL,
-            headers={"Content-Type": "application/json", "response_mime_type": "application/json"},
-            json={"contents": [{"parts": [{"text": evaluation_prompt}]}]},
+            headers={"Content-Type": "application/json"},
+            json={
+                "contents": [{"parts": [{"text": evaluation_prompt}]}],
+                "generationConfig": {"response_mime_type": "application/json"},
+            },
         )
         response.raise_for_status()
 
         # Parse evaluation response
-        eval_response = json.loads(clean_json(response.json()["candidates"][0]["content"]["parts"][0]["text"]))
-        if "decision" in eval_response and isinstance(eval_response["decision"], str):
-            eval_response["decision"] = eval_response["decision"].upper() == "TRUE"
+        eval_response: dict[str, str] = json.loads(
+            clean_json(response.json()["candidates"][0]["content"]["parts"][0]["text"])
+        )
+        decision = str(eval_response.get("decision", "")).upper() == "TRUE"
+        explanation = eval_response.get("explanation", "")
+        # Handle evaluation service errors
+        if "503 Service Error" in explanation:
+            decision = None
         # Extract decision and explanation from structured response
-        return {
-            "decision": eval_response.get("decision", False),
-            "explanation": eval_response.get("explanation", ""),
-        }
+        return decision, explanation
     except Exception as e:
         logger.error(f"Error in evaluation: {e}")
-        return {"decision": "FALSE", "explanation": f"Evaluation failed: {str(e)}"}
+        return None, f"Evaluation failed: {str(e)}"
 
 
 def process_batch(batch, batch_start, results, dataset_length):
@@ -107,17 +114,17 @@ def process_batch(batch, batch_start, results, dataset_length):
         logger.info(f"Processing example: {current_index}/{dataset_length}")
 
         # Trigger research mode if enabled
-        prompt = f"/{KHOJ_MODE} {prompt}" if KHOJ_MODE else prompt
+        prompt = f"/{KHOJ_MODE} {prompt}" if KHOJ_MODE and not prompt.startswith(f"/{KHOJ_MODE}") else prompt
 
         # Get agent response
         agent_response = get_agent_response(prompt)
 
         # Evaluate response
-        if agent_response is None or agent_response.strip() == "":
-            evaluation["decision"] = False
-            evaluation["explanation"] = "Agent response is empty. This maybe due to a service error."
+        if is_none_or_empty(agent_response):
+            decision = None
+            explanation = "Agent response is empty. This maybe due to a service error."
         else:
-            evaluation = evaluate_response(prompt, agent_response, answer)
+            decision, explanation = evaluate_response(prompt, agent_response, answer)
 
         # Store results
         results.append(
@@ -126,25 +133,29 @@ def process_batch(batch, batch_start, results, dataset_length):
                 "prompt": prompt,
                 "ground_truth": answer,
                 "agent_response": agent_response,
-                "evaluation_decision": evaluation["decision"],
-                "evaluation_explanation": evaluation["explanation"],
+                "evaluation_decision": decision,
+                "evaluation_explanation": explanation,
                 "reasoning_type": reasoning_type,
             }
         )
 
-        # Color the decision based on its value
-        decision = evaluation["decision"]
-        decision_color = "green" if decision == True else "red"
+        # Log results
+        decision_color = {True: "green", None: "blue", False: "red"}[decision]
         colored_decision = color_text(str(decision), decision_color)
         logger.info(
-            f'Decision: {colored_decision}\nQuestion: {prompt}\nExpected Answer: {answer}\nAgent Answer: {agent_response}\nExplanation: {evaluation["explanation"]}\n'
+            f"Decision: {colored_decision}\nQuestion: {prompt}\nExpected Answer: {answer}\nAgent Answer: {agent_response}\nExplanation: {explanation}\n"
         )
 
         time.sleep(SLEEP_SECONDS)  # Rate limiting
 
 
 def color_text(text, color):
-    colors = {"red": "\033[91m", "green": "\033[92m", "reset": "\033[0m"}
+    colors = {
+        "red": "\033[91m",  # Bright red
+        "green": "\033[32m",  # Standard green
+        "blue": "\033[94m",  # Bright blue
+        "reset": "\033[0m",
+    }
     return f"{colors[color]}{text}{colors['reset']}"
 
 
@@ -153,15 +164,28 @@ def clean_json(response: str):
     return response.strip().replace("\n", "").removeprefix("```json").removesuffix("```")
 
 
+def parse_args():
+    parser = argparse.ArgumentParser(description="Evaluate Khoj on the Google FRAMES benchmark.")
+    parser.add_argument(
+        "--output",
+        "-o",
+        default=None,
+        help="Path to store evaluation results CSV (default: frames_evaluation_results_[datetime].csv)",
+    )
+    return parser.parse_args()
+
+
 def main():
+    # Initialize variables
+    args = parse_args()
+
     # Load dataset
     with timer("Loaded dataset in", logger):
         dataset = load_frames_dataset()
     if dataset is None:
         return
 
     # Initialize variables
-    counter = 0
     results = []
     dataset_length = len(dataset["Prompt"])
 
@@ -182,31 +206,37 @@ def main():
 
     # Calculate metrics
     df = pd.DataFrame(results)
-    accuracy = (df["evaluation_decision"] == True).mean()
+    eval_df = df.dropna(subset=["evaluation_decision"])  # Exclude rows with missing evaluation decision
+    accuracy = (eval_df["evaluation_decision"] == True).mean()
 
     # Calculate accuracy by reasoning type
-    reasoning_type_accuracy = df.groupby("reasoning_type")["evaluation_decision"].apply(lambda x: (x == True).mean())
-
-    # Save results
-    df.to_csv("frames_evaluation_results.csv", index=False)
+    reasoning_type_accuracy = eval_df.groupby("reasoning_type")["evaluation_decision"].apply(
+        lambda x: (x == True).mean()
+    )
 
     # Print summary
     logger.info(f"\nOverall Accuracy: {accuracy:.2%}")
-    logger.info("\nAccuracy by Reasoning Type:")
-    logger.info(reasoning_type_accuracy)
+    logger.info(f"\nAccuracy by Reasoning Type:\n{reasoning_type_accuracy}")
+
+    # Save results
+    output_file = args.output or f"frames_evaluation_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
+    df.to_csv(output_file, index=False)
+    logger.info(f"Results saved to {output_file}")
 
 
 if __name__ == "__main__":
     """
     Evaluate Khoj on the Google FRAMES benchmark.
     Response are evaluated by GEMINI_EVAL_MODEL (default: gemini-pro-1.5-002).
 
-    Khoj should be running at KHOJ_URL, default at http://localhost:42110.
+    Khoj should be running at KHOJ_URL (default: http://localhost:42110).
     The Gemini judge model is accessed via the Gemini API with your GEMINI_API_KEY.
     To evaluate Khoj in research mode, set the KHOJ_MODE environment variable to "research".
 
     Run the script using the following command:
     KHOJ_MODE="research" GEMINI_API_KEY="<your_gemini_api_key>" python eval_frames.py
     """
-    with timer("Ran eval in", logger):
+    logger.info(f"{datetime.now()} - Begin Quizzing Khoj on the FRAMES benchmark.")
+    with timer("Ran eval script in", logger, log_level=logging.INFO):
         main()
+    logger.info(f"{datetime.now()} - End Quizzing Khoj on the FRAMES benchmark.")