From a67512d742216012bc38b67ce0bb9cc5f3341365 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Wed, 8 Jan 2025 12:16:34 +0100 Subject: [PATCH 01/15] QA eval dataset as argument, with hotpot and 2wikimultihop as options. Json schema validation for datasets. --- evals/eval_on_hotpot.py | 78 ++++++++++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 13 deletions(-) diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index e07e80e0c..40f4fe9d3 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -8,15 +8,68 @@ import wget from deepeval.dataset import EvaluationDataset from deepeval.test_case import LLMTestCase +from jsonschema import ValidationError, validate from tqdm import tqdm import cognee import evals.deepeval_metrics from cognee.api.v1.search import SearchType -from cognee.base_config import get_base_config from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt +from cognee.root_dir import get_absolute_path +qa_datasets = { + "hotpotqa": { + "filename": "hotpot_dev_fullwiki_v1.json", + "URL": "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json" + }, + "2wikimultihop": { + "filename": "data/dev.json", + "URL": "https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1" + } +} + +qa_json_schema = { + "type": "array", + "items": { + "type": "object", + "properties": { + "answer": {"type": "string"}, + "question": {"type": "string"}, + "context": {"type": "array"}, + }, + "required": ["answer", "question", "context"], + "additionalProperties": True + } +} + + +def download_qa_dataset(dataset_name: str, dir: str): + + if dataset_name not in qa_datasets: + raise ValueError(f"{dataset_name} is not a supported dataset.") + + url = qa_datasets[dataset_name]["URL"] + + if dataset_name == "2wikimultihop": + raise Exception("Please download 2wikimultihop dataset (data.zip) manually from \ + https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1 \ + and unzip it.") + + wget.download(url, out=dir) + + +def load_qa_dataset(filepath: Path): + + with open(filepath, "r") as file: + dataset = json.load(file) + + try: + validate(instance=dataset, schema=qa_json_schema) + except ValidationError as e: + print("File is not a valid QA dataset:", e.message) + + return dataset async def answer_without_cognee(instance): args = { @@ -39,9 +92,8 @@ async def answer_with_cognee(instance): await cognee.prune.prune_system(metadata=True) for (title, sentences) in instance["context"]: - await cognee.add("\n".join(sentences), dataset_name = "HotPotQA") - - await cognee.cognify("HotPotQA") + await cognee.add("\n".join(sentences), dataset_name = "QA") + await cognee.cognify("QA") search_results = await cognee.search( SearchType.INSIGHTS, query_text=instance["question"] @@ -80,20 +132,19 @@ async def eval_answers(instances, answers, eval_metric): return eval_results -async def eval_on_hotpotQA(answer_provider, num_samples, eval_metric): - base_config = get_base_config() - data_root_dir = base_config.data_root_directory +async def eval_on_QA_dataset(dataset_name: str, answer_provider, num_samples, eval_metric): + + data_root_dir = get_absolute_path("../.data") if not Path(data_root_dir).exists(): Path(data_root_dir).mkdir() - filepath = data_root_dir / Path("hotpot_dev_fullwiki_v1.json") + filename = qa_datasets[dataset_name]["filename"] + filepath = data_root_dir / Path(filename) if not 
filepath.exists(): - url = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json' - wget.download(url, out=data_root_dir) + download_qa_dataset(dataset_name, data_root_dir) - with open(filepath, "r") as file: - dataset = json.load(file) + dataset = load_qa_dataset(filepath) instances = dataset if not num_samples else dataset[:num_samples] answers = [] @@ -109,6 +160,7 @@ async def eval_on_hotpotQA(answer_provider, num_samples, eval_metric): if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument("--dataset", type=str, choices=list(qa_datasets.keys()), help="Which dataset to evaluate on") parser.add_argument("--with_cognee", action="store_true") parser.add_argument("--num_samples", type=int, default=500) parser.add_argument("--metric", type=str, default="correctness_metric", @@ -130,5 +182,5 @@ async def eval_on_hotpotQA(answer_provider, num_samples, eval_metric): else: answer_provider = answer_without_cognee - avg_score = asyncio.run(eval_on_hotpotQA(answer_provider, args.num_samples, metric)) + avg_score = asyncio.run(eval_on_QA_dataset(args.dataset, answer_provider, args.num_samples, metric)) print(f"Average {args.metric}: {avg_score}") \ No newline at end of file From e0a8c19172f606128123f6936105b94db7e0e706 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 9 Jan 2025 15:32:22 +0100 Subject: [PATCH 02/15] Load dataset file by filename, outsource utilities --- evals/eval_on_hotpot.py | 80 +++------------------------------------ evals/qa_dataset_utils.py | 74 ++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 74 deletions(-) create mode 100644 evals/qa_dataset_utils.py diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index 0558edc99..ee2435e6b 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -1,14 +1,9 @@ import argparse import asyncio -import json import statistics -from pathlib import Path - import deepeval.metrics -import wget from deepeval.dataset import EvaluationDataset from deepeval.test_case import LLMTestCase -from jsonschema import ValidationError, validate from tqdm import tqdm import cognee @@ -16,60 +11,7 @@ from cognee.api.v1.search import SearchType from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt -from cognee.root_dir import get_absolute_path - -qa_datasets = { - "hotpotqa": { - "filename": "hotpot_dev_fullwiki_v1.json", - "URL": "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json", - }, - "2wikimultihop": { - "filename": "data/dev.json", - "URL": "https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1", - }, -} - -qa_json_schema = { - "type": "array", - "items": { - "type": "object", - "properties": { - "answer": {"type": "string"}, - "question": {"type": "string"}, - "context": {"type": "array"}, - }, - "required": ["answer", "question", "context"], - "additionalProperties": True, - }, -} - - -def download_qa_dataset(dataset_name: str, dir: str): - if dataset_name not in qa_datasets: - raise ValueError(f"{dataset_name} is not a supported dataset.") - - url = qa_datasets[dataset_name]["URL"] - - if dataset_name == "2wikimultihop": - raise Exception( - "Please download 2wikimultihop dataset (data.zip) manually from \ - https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1 \ - and unzip it." 
- ) - - wget.download(url, out=dir) - - -def load_qa_dataset(filepath: Path): - with open(filepath, "r") as file: - dataset = json.load(file) - - try: - validate(instance=dataset, schema=qa_json_schema) - except ValidationError as e: - print("File is not a valid QA dataset:", e.message) - - return dataset +from evals.qa_dataset_utils import load_qa_dataset async def answer_without_cognee(instance): @@ -135,18 +77,10 @@ async def eval_answers(instances, answers, eval_metric): return eval_results -async def eval_on_QA_dataset(dataset_name: str, answer_provider, num_samples, eval_metric): - data_root_dir = get_absolute_path("../.data") - - if not Path(data_root_dir).exists(): - Path(data_root_dir).mkdir() - - filename = qa_datasets[dataset_name]["filename"] - filepath = data_root_dir / Path(filename) - if not filepath.exists(): - download_qa_dataset(dataset_name, data_root_dir) - - dataset = load_qa_dataset(filepath) +async def eval_on_QA_dataset( + dataset_name_or_filename: str, answer_provider, num_samples, eval_metric +): + dataset = load_qa_dataset(dataset_name_or_filename) instances = dataset if not num_samples else dataset[:num_samples] answers = [] @@ -165,9 +99,7 @@ async def eval_on_QA_dataset(dataset_name: str, answer_provider, num_samples, ev if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--dataset", type=str, choices=list(qa_datasets.keys()), help="Which dataset to evaluate on" - ) + parser.add_argument("--dataset", type=str, help="Which dataset to evaluate on") parser.add_argument("--with_cognee", action="store_true") parser.add_argument("--num_samples", type=int, default=500) parser.add_argument( diff --git a/evals/qa_dataset_utils.py b/evals/qa_dataset_utils.py new file mode 100644 index 000000000..bb6dd5bbb --- /dev/null +++ b/evals/qa_dataset_utils.py @@ -0,0 +1,74 @@ +from cognee.root_dir import get_absolute_path +import json +import wget +from jsonschema import ValidationError, validate +from pathlib import Path + + +qa_datasets = { + "hotpotqa": { + "filename": "hotpot_dev_fullwiki_v1.json", + "URL": "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json", + }, + "2wikimultihop": { + "filename": "data/dev.json", + "URL": "https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1", + }, +} + +qa_json_schema = { + "type": "array", + "items": { + "type": "object", + "properties": { + "answer": {"type": "string"}, + "question": {"type": "string"}, + "context": {"type": "array"}, + }, + "required": ["answer", "question", "context"], + "additionalProperties": True, + }, +} + + +def download_qa_dataset(dataset_name: str, dir: str): + if dataset_name not in qa_datasets: + raise ValueError(f"{dataset_name} is not a supported dataset.") + + url = qa_datasets[dataset_name]["URL"] + + if dataset_name == "2wikimultihop": + raise Exception( + "Please download 2wikimultihop dataset (data.zip) manually from \ + https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1 \ + and unzip it." 
+ ) + + wget.download(url, out=dir) + + +def load_qa_dataset(dataset_name_or_filename: str): + if dataset_name_or_filename in qa_datasets: + dataset_name = dataset_name_or_filename + filename = qa_datasets[dataset_name]["filename"] + + data_root_dir = get_absolute_path("../.data") + if not Path(data_root_dir).exists(): + Path(data_root_dir).mkdir() + + filepath = data_root_dir / Path(filename) + if not filepath.exists(): + download_qa_dataset(dataset_name, data_root_dir) + else: + filename = dataset_name_or_filename + filepath = Path(filename) + + with open(filepath, "r") as file: + dataset = json.load(file) + + try: + validate(instance=dataset, schema=qa_json_schema) + except ValidationError as e: + print("File is not a valid QA dataset:", e.message) + + return dataset From 49fb0535036d97f3c9fa50fbfa03b2bdd368a743 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Sat, 11 Jan 2025 13:48:50 +0100 Subject: [PATCH 03/15] restructure metric selection --- evals/eval_on_hotpot.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index ee2435e6b..6c924c84c 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -12,6 +12,7 @@ from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt from evals.qa_dataset_utils import load_qa_dataset +from evals.qa_metrics_utils import get_metric async def answer_without_cognee(instance): @@ -78,9 +79,10 @@ async def eval_answers(instances, answers, eval_metric): async def eval_on_QA_dataset( - dataset_name_or_filename: str, answer_provider, num_samples, eval_metric + dataset_name_or_filename: str, answer_provider, num_samples, eval_metric_name ): dataset = load_qa_dataset(dataset_name_or_filename) + eval_metric = get_metric(eval_metric_name) instances = dataset if not num_samples else dataset[:num_samples] answers = [] @@ -102,30 +104,16 @@ async def eval_on_QA_dataset( parser.add_argument("--dataset", type=str, help="Which dataset to evaluate on") parser.add_argument("--with_cognee", action="store_true") parser.add_argument("--num_samples", type=int, default=500) - parser.add_argument( - "--metric", - type=str, - default="correctness_metric", - help="Valid options are Deepeval metrics (e.g. AnswerRelevancyMetric) \ - and metrics defined in evals/deepeval_metrics.py, e.g. 
f1_score_metric", - ) + parser.add_argument("--metric_name", type=str, default="Correctness") args = parser.parse_args() - try: - metric_cls = getattr(deepeval.metrics, args.metric) - metric = metric_cls() - except AttributeError: - metric = getattr(evals.deepeval_metrics, args.metric) - if isinstance(metric, type): - metric = metric() - if args.with_cognee: answer_provider = answer_with_cognee else: answer_provider = answer_without_cognee avg_score = asyncio.run( - eval_on_QA_dataset(args.dataset, answer_provider, args.num_samples, metric) + eval_on_QA_dataset(args.dataset, answer_provider, args.num_samples, args.metric_name) ) print(f"Average {args.metric}: {avg_score}") From 13422ba4aef382b0d14941109e3a0d5960135104 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Sat, 11 Jan 2025 14:24:07 +0100 Subject: [PATCH 04/15] Add comprehensiveness, diversity and empowerment metrics --- evals/deepeval_metrics.py | 39 +++++++++++++++++++++++++++++++++++++++ evals/eval_on_hotpot.py | 2 +- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/evals/deepeval_metrics.py b/evals/deepeval_metrics.py index 9ce1e9e4f..6ef8e822f 100644 --- a/evals/deepeval_metrics.py +++ b/evals/deepeval_metrics.py @@ -12,6 +12,45 @@ ], ) +comprehensiveness_metric = GEval( + name="Comprehensiveness", + model="gpt-4o-mini", + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + evaluation_steps=[ + "Determine how much detail the answer provides to cover all the aspects and details of the question." + ], +) + +diversity_metric = GEval( + name="Diversity", + model="gpt-4o-mini", + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + evaluation_steps=[ + "Determine how varied and rich the answer is in providing different perspectives and insights on the question." + ], +) + +empowerment_metric = GEval( + name="Empowerment", + model="gpt-4o-mini", + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + evaluation_steps=[ + "Determine how well the answer helps the reader understand and make informed judgements about the topic." 
+ ], +) + class f1_score_metric(BaseMetric): """F1 score taken directly from the official hotpot benchmark diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index 6c924c84c..20a965106 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -116,4 +116,4 @@ async def eval_on_QA_dataset( avg_score = asyncio.run( eval_on_QA_dataset(args.dataset, answer_provider, args.num_samples, args.metric_name) ) - print(f"Average {args.metric}: {avg_score}") + print(f"Average {args.metric_name}: {avg_score}") From d57609db4b59492519c0df7a799bd0e216fe5056 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Mon, 13 Jan 2025 10:37:19 +0100 Subject: [PATCH 05/15] add promptfoo as an option --- evals/eval_on_hotpot.py | 14 +++- evals/promptfoo_metrics.py | 53 +++++++++++++ evals/promptfoo_wrapper.py | 154 +++++++++++++++++++++++++++++++++++++ evals/promptfooconfig.yaml | 13 ++++ evals/promptfooprompt.json | 10 +++ evals/qa_metrics_utils.py | 44 +++++++++++ 6 files changed, 284 insertions(+), 4 deletions(-) create mode 100644 evals/promptfoo_metrics.py create mode 100644 evals/promptfoo_wrapper.py create mode 100644 evals/promptfooconfig.yaml create mode 100644 evals/promptfooprompt.json create mode 100644 evals/qa_metrics_utils.py diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index 20a965106..d4af283e2 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -1,13 +1,11 @@ import argparse import asyncio import statistics -import deepeval.metrics from deepeval.dataset import EvaluationDataset from deepeval.test_case import LLMTestCase from tqdm import tqdm import cognee -import evals.deepeval_metrics from cognee.api.v1.search import SearchType from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt @@ -32,7 +30,7 @@ async def answer_without_cognee(instance): return answer_prediction -async def answer_with_cognee(instance): +async def get_context_with_cognee(instance): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) @@ -45,7 +43,11 @@ async def answer_with_cognee(instance): SearchType.SUMMARIES, query_text=instance["question"] ) search_results = search_results + search_results_second + return search_results + +async def answer_with_cognee(instance): + search_results = get_context_with_cognee(instance) args = { "question": instance["question"], "context": search_results, @@ -82,9 +84,13 @@ async def eval_on_QA_dataset( dataset_name_or_filename: str, answer_provider, num_samples, eval_metric_name ): dataset = load_qa_dataset(dataset_name_or_filename) - eval_metric = get_metric(eval_metric_name) + eval_metric = get_metric(eval_metric_name) instances = dataset if not num_samples else dataset[:num_samples] + + if eval_metric_name.startswith("promptfoo"): + return await eval_metric.measure(instances) + answers = [] for instance in tqdm(instances, desc="Getting answers"): answer = await answer_provider(instance) diff --git a/evals/promptfoo_metrics.py b/evals/promptfoo_metrics.py new file mode 100644 index 000000000..1806c105b --- /dev/null +++ b/evals/promptfoo_metrics.py @@ -0,0 +1,53 @@ +from evals.promptfoo_wrapper import PromptfooWrapper +import os +from deepeval.test_case import LLMTestCase +import yaml +import json + + +class PromptfooComprehensiveness: + def __init__(self, threshold: float = 0.5): + self.wrapper = PromptfooWrapper(promptfoo_path="/opt/homebrew/bin/promptfoo") + self.threshold = threshold + + async def measure(self, 
instances): + with open(os.path.join(os.getcwd(), "evals/promptfooconfig.yaml"), "r") as file: + config = yaml.safe_load(file) + + # creating config file + tests = [] + for instance in instances: + from evals.eval_on_hotpot import get_context_with_cognee + + context = await get_context_with_cognee(instance) + test = { + "vars": { + "name": instance["question"][:15], + "question": instance["question"], + "context": str(context), + } + } + tests.append(test) + config["tests"] = tests + + # Write the updated YAML back, preserving formatting and structure + updated_yaml_file_path = os.path.join(os.getcwd(), "config_with_context.yaml") + with open(updated_yaml_file_path, "w") as file: + yaml.dump(config, file) + + self.wrapper.run_eval( + prompt_file=os.path.join(os.getcwd(), "evals/promptfooprompt.json"), + config_file=os.path.join(os.getcwd(), "config_with_context.yaml"), + out_format="json", + ) + + file_path = os.path.join(os.getcwd(), "benchmark_results.json") + + # Read and parse the JSON file + with open(file_path, "r") as file: + results = json.load(file) + + self.score = results["results"]["prompts"][0]["metrics"]["score"] + + self.success = self.score >= self.threshold + return self.score diff --git a/evals/promptfoo_wrapper.py b/evals/promptfoo_wrapper.py new file mode 100644 index 000000000..0ed2d4850 --- /dev/null +++ b/evals/promptfoo_wrapper.py @@ -0,0 +1,154 @@ +import subprocess +import json +import logging +import os +from typing import List, Optional, Dict, Generator +import shutil +import platform +from dotenv import load_dotenv + +logger = logging.getLogger(__name__) + +# Load environment variables from .env file +load_dotenv() + + +class PromptfooWrapper: + """ + A Python wrapper class around the promptfoo CLI tool, allowing you to: + - Evaluate prompts against different language models. + - Compare responses from multiple models. + - Pass configuration and prompt files. + - Retrieve the outputs in a structured format, including binary output if needed. + + This class assumes you have the promptfoo CLI installed and accessible in your environment. + For more details on promptfoo, see: https://github.com/promptfoo/promptfoo + """ + + def __init__(self, promptfoo_path: str = ""): + """ + Initialize the wrapper with the path to the promptfoo executable. + + :param promptfoo_path: Path to the promptfoo binary (default: 'promptfoo') + """ + self.promptfoo_path = promptfoo_path + logger.debug(f"Initialized PromptfooWrapper with binary at: {self.promptfoo_path}") + + def _validate_path(self, file_path: Optional[str]) -> None: + """ + Validate that a file path is accessible if provided. + Raise FileNotFoundError if it does not exist. + """ + if file_path and not os.path.isfile(file_path): + logger.error(f"File not found: {file_path}") + raise FileNotFoundError(f"File not found: {file_path}") + + def _get_node_bin_dir(self) -> str: + """ + Determine the Node.js binary directory dynamically for macOS and Linux. 
+ """ + node_executable = shutil.which("node") + if not node_executable: + logger.error("Node.js is not installed or not found in the system PATH.") + raise EnvironmentError("Node.js is not installed or not in PATH.") + + # Determine the Node.js binary directory + node_bin_dir = os.path.dirname(node_executable) + + # Special handling for macOS, where Homebrew installs Node in /usr/local or /opt/homebrew + if platform.system() == "Darwin": # macOS + logger.debug("Running on macOS") + brew_prefix = os.popen("brew --prefix node").read().strip() + if brew_prefix and os.path.exists(brew_prefix): + node_bin_dir = os.path.join(brew_prefix, "bin") + logger.debug(f"Detected Node.js binary directory using Homebrew: {node_bin_dir}") + + # For Linux, Node.js installed via package managers should work out of the box + logger.debug(f"Detected Node.js binary directory: {node_bin_dir}") + return node_bin_dir + + def _run_command( + self, + cmd: List[str], + filename, + ) -> Generator[Dict, None, None]: + """ + Run a given command using subprocess and parse the output. + """ + logger.debug(f"Running command: {' '.join(cmd)}") + + # Make a copy of the current environment + env = os.environ.copy() + + try: + node_bin_dir = self._get_node_bin_dir() + print(node_bin_dir) + env["PATH"] = f"{node_bin_dir}:{env['PATH']}" + + except EnvironmentError as e: + logger.error(f"Failed to set Node.js binary directory: {e}") + raise + + # Add node's bin directory to the PATH + # node_bin_dir = "/Users/vasilije/Library/Application Support/JetBrains/PyCharm2024.2/node/versions/20.15.0/bin" + # # env["PATH"] = f"{node_bin_dir}:{env['PATH']}" + + result = subprocess.run(cmd, capture_output=True, text=True, check=False, env=env) + + print(result.stderr) + with open(filename, "r", encoding="utf-8") as file: + read_data = json.load(file) + print(f"{filename} created and written.") + + # Log raw stdout for debugging + logger.debug(f"Raw command output:\n{result.stdout}") + + # Use the parse_promptfoo_output function to yield parsed results + return read_data + + def run_eval( + self, + prompt_file: Optional[str] = None, + config_file: Optional[str] = None, + eval_file: Optional[str] = None, + out_format: str = "json", + extra_args: Optional[List[str]] = None, + binary_output: bool = False, + ) -> List[Dict]: + """ + Run the `promptfoo eval` command with the provided parameters and return parsed results. + + :param prompt_file: Path to a file containing one or more prompts. + :param config_file: Path to a config file specifying models, scoring methods, etc. + :param eval_file: Path to an eval file with test data. + :param out_format: Output format, e.g., 'json', 'yaml', or 'table'. + :param extra_args: Additional command-line arguments for fine-tuning evaluation. + :param binary_output: If True, interpret output as binary data instead of text. + :return: List of parsed results (each result is a dictionary). 
+ """ + self._validate_path(prompt_file) + self._validate_path(config_file) + self._validate_path(eval_file) + + filename = "benchmark_results" + + filename = os.path.join(os.getcwd(), f"{filename}.json") + + cmd = [self.promptfoo_path, "eval"] + if prompt_file: + cmd.extend(["--prompts", prompt_file]) + if config_file: + cmd.extend(["--config", config_file]) + if eval_file: + cmd.extend(["--eval", eval_file]) + cmd.extend(["--output", filename]) + if extra_args: + cmd.extend(extra_args) + + # Log the constructed command for debugging + logger.debug(f"Constructed command: {' '.join(cmd)}") + + # Collect results from the generator + results = self._run_command(cmd, filename=filename) + logger.debug(f"Parsed results: {json.dumps(results, indent=4)}") + return results diff --git a/evals/promptfooconfig.yaml b/evals/promptfooconfig.yaml new file mode 100644 index 000000000..b7dd32764 --- /dev/null +++ b/evals/promptfooconfig.yaml @@ -0,0 +1,13 @@ +# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json + +# Learn more about building a configuration: https://promptfoo.dev/docs/configuration/guide + +description: "My eval" +providers: + - id: openai:gpt-4o-mini + +defaultTest: + assert: + # Comprehensiveness + - type: llm-rubric + value: Determine how much detail the answer provides to cover all the aspects and details of the question. diff --git a/evals/promptfooprompt.json b/evals/promptfooprompt.json new file mode 100644 index 000000000..06d5ebc98 --- /dev/null +++ b/evals/promptfooprompt.json @@ -0,0 +1,10 @@ +[ + { + "role": "system", + "content": "Answer the question using the provided context. Be as brief as possible. Each entry in the context is tuple of length 3, representing an edge of a knowledge graph with its two nodes.." + }, + { + "role": "user", + "content": "The question is: `{{ question }}` \n And here is the context: `{{ context }}`" + } +] diff --git a/evals/qa_metrics_utils.py b/evals/qa_metrics_utils.py new file mode 100644 index 000000000..8d12cfe4a --- /dev/null +++ b/evals/qa_metrics_utils.py @@ -0,0 +1,44 @@ +from evals.deepeval_metrics import ( + correctness_metric, + comprehensiveness_metric, + diversity_metric, + empowerment_metric, + f1_score_metric, + em_score_metric, +) +from evals.promptfoo_metrics import PromptfooComprehensiveness +from deepeval.metrics import AnswerRelevancyMetric +import deepeval.metrics + +native_deepeval_metrics = {"AnswerRelevancy": AnswerRelevancyMetric} + +custom_deepeval_metrics = { + "Correctness": correctness_metric, + "Comprehensiveness": comprehensiveness_metric, + "Diversity": diversity_metric, + "Empowerment": empowerment_metric, + "F1": f1_score_metric, + "EM": em_score_metric, +} + +promptfoo_metrics = { + "promptfoo.comprehensiveness": PromptfooComprehensiveness, +} + +qa_metrics = native_deepeval_metrics | custom_deepeval_metrics | promptfoo_metrics + + +def get_metric(metric_name: str): + if metric_name in qa_metrics: + metric = qa_metrics[metric_name] + else: + try: + metric_cls = getattr(deepeval.metrics, metric_name) + metric = metric_cls() + except AttributeError: + raise Exception(f"Metric {metric_name} not supported") + + if isinstance(metric, type): + metric = metric() + + return metric From 8eedc2bb8d0413feb86feb6b8135619367531816 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Mon, 13 Jan 2025 14:19:58 +0100 Subject: [PATCH 06/15] refactor RAG solution in eval;2C --- evals/eval_on_hotpot.py | 45 ++++++++----------- ...ig.yaml => promptfoo_config_template.yaml} | 0 evals/promptfoo_metrics.py | 12 
+++-- 3 files changed, 23 insertions(+), 34 deletions(-) rename evals/{promptfooconfig.yaml => promptfoo_config_template.yaml} (100%) diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index d4af283e2..27d9b554f 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -13,23 +13,6 @@ from evals.qa_metrics_utils import get_metric -async def answer_without_cognee(instance): - args = { - "question": instance["question"], - "context": instance["context"], - } - user_prompt = render_prompt("context_for_question.txt", args) - system_prompt = read_query_prompt("answer_hotpot_question.txt") - - llm_client = get_llm_client() - answer_prediction = await llm_client.acreate_structured_output( - text_input=user_prompt, - system_prompt=system_prompt, - response_model=str, - ) - return answer_prediction - - async def get_context_with_cognee(instance): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) @@ -43,14 +26,22 @@ async def get_context_with_cognee(instance): SearchType.SUMMARIES, query_text=instance["question"] ) search_results = search_results + search_results_second - return search_results + search_results_str = "\n".join([context_item["text"] for context_item in search_results]) + + return search_results_str + + +async def get_context_without_cognee(instance): + return instance["context"] + + +async def answer_qa_instance(instance, context_provider): + context = context_provider(instance) -async def answer_with_cognee(instance): - search_results = get_context_with_cognee(instance) args = { "question": instance["question"], - "context": search_results, + "context": context, } user_prompt = render_prompt("context_for_question.txt", args) system_prompt = read_query_prompt("answer_hotpot_using_cognee_search.txt") @@ -81,7 +72,7 @@ async def eval_answers(instances, answers, eval_metric): async def eval_on_QA_dataset( - dataset_name_or_filename: str, answer_provider, num_samples, eval_metric_name + dataset_name_or_filename: str, context_provider, num_samples, eval_metric_name ): dataset = load_qa_dataset(dataset_name_or_filename) @@ -89,11 +80,11 @@ async def eval_on_QA_dataset( instances = dataset if not num_samples else dataset[:num_samples] if eval_metric_name.startswith("promptfoo"): - return await eval_metric.measure(instances) + return await eval_metric.measure(instances, context_provider) answers = [] for instance in tqdm(instances, desc="Getting answers"): - answer = await answer_provider(instance) + answer = await answer_qa_instance(instance, context_provider) answers.append(answer) eval_results = await eval_answers(instances, answers, eval_metric) @@ -115,11 +106,11 @@ async def eval_on_QA_dataset( args = parser.parse_args() if args.with_cognee: - answer_provider = answer_with_cognee + context_provider = get_context_with_cognee else: - answer_provider = answer_without_cognee + context_provider = get_context_without_cognee avg_score = asyncio.run( - eval_on_QA_dataset(args.dataset, answer_provider, args.num_samples, args.metric_name) + eval_on_QA_dataset(args.dataset, context_provider, args.num_samples, args.metric_name) ) print(f"Average {args.metric_name}: {avg_score}") diff --git a/evals/promptfooconfig.yaml b/evals/promptfoo_config_template.yaml similarity index 100% rename from evals/promptfooconfig.yaml rename to evals/promptfoo_config_template.yaml diff --git a/evals/promptfoo_metrics.py b/evals/promptfoo_metrics.py index 1806c105b..05a18b234 100644 --- a/evals/promptfoo_metrics.py +++ b/evals/promptfoo_metrics.py @@ -10,21 +10,19 
@@ def __init__(self, threshold: float = 0.5): self.wrapper = PromptfooWrapper(promptfoo_path="/opt/homebrew/bin/promptfoo") self.threshold = threshold - async def measure(self, instances): - with open(os.path.join(os.getcwd(), "evals/promptfooconfig.yaml"), "r") as file: + async def measure(self, instances, context_provider): + with open(os.path.join(os.getcwd(), "evals/promptfoo_config_template.yaml"), "r") as file: config = yaml.safe_load(file) - # creating config file + # Fill config file with test cases tests = [] for instance in instances: - from evals.eval_on_hotpot import get_context_with_cognee - - context = await get_context_with_cognee(instance) + context = await context_provider(instance) test = { "vars": { "name": instance["question"][:15], "question": instance["question"], - "context": str(context), + "context": context, } } tests.append(test) From 079c16c8f0c3db7da3279d7ddb6c6d22ab504c68 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Mon, 13 Jan 2025 15:36:23 +0100 Subject: [PATCH 07/15] LLM as a judge metrics implemented in a uniform way --- evals/deepeval_metrics.py | 26 +++++++++++++++----------- evals/promptfoo_metrics.py | 9 +++++---- evals/promptfoo_wrapper.py | 3 +++ evals/promptfooprompt.json | 2 +- evals/qa_metrics_utils.py | 11 +++++++++-- 5 files changed, 33 insertions(+), 18 deletions(-) diff --git a/evals/deepeval_metrics.py b/evals/deepeval_metrics.py index 6ef8e822f..51d6c9181 100644 --- a/evals/deepeval_metrics.py +++ b/evals/deepeval_metrics.py @@ -2,14 +2,13 @@ from deepeval.test_case import LLMTestCase, LLMTestCaseParams from evals.official_hotpot_metrics import exact_match_score, f1_score +from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts correctness_metric = GEval( name="Correctness", model="gpt-4o-mini", evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT], - evaluation_steps=[ - "Determine whether the actual output is factually correct based on the expected output." - ], + evaluation_steps=[llm_judge_prompts["correctness"]], ) comprehensiveness_metric = GEval( @@ -20,9 +19,7 @@ LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT, ], - evaluation_steps=[ - "Determine how much detail the answer provides to cover all the aspects and details of the question." - ], + evaluation_steps=[llm_judge_prompts["comprehensiveness"]], ) diversity_metric = GEval( @@ -33,9 +30,7 @@ LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT, ], - evaluation_steps=[ - "Determine how varied and rich the answer is in providing different perspectives and insights on the question." - ], + evaluation_steps=[llm_judge_prompts["diversity"]], ) empowerment_metric = GEval( @@ -46,9 +41,18 @@ LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT, ], - evaluation_steps=[ - "Determine how well the answer helps the reader understand and make informed judgements about the topic." 
+ evaluation_steps=[llm_judge_prompts["empowerment"]], +) + +directness_metric = GEval( + name="Directness", + model="gpt-4o-mini", + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, ], + evaluation_steps=[llm_judge_prompts["directness"]], ) diff --git a/evals/promptfoo_metrics.py b/evals/promptfoo_metrics.py index 05a18b234..8c1169d00 100644 --- a/evals/promptfoo_metrics.py +++ b/evals/promptfoo_metrics.py @@ -5,15 +5,17 @@ import json -class PromptfooComprehensiveness: - def __init__(self, threshold: float = 0.5): +class PromptfooMetric: + def __init__(self, judge_prompt): self.wrapper = PromptfooWrapper(promptfoo_path="/opt/homebrew/bin/promptfoo") - self.threshold = threshold + self.judge_prompt = judge_prompt async def measure(self, instances, context_provider): with open(os.path.join(os.getcwd(), "evals/promptfoo_config_template.yaml"), "r") as file: config = yaml.safe_load(file) + config["defaultTest"] = [{"assert": {"type": "llm_rubric", "value": self.judge_prompt}}] + # Fill config file with test cases tests = [] for instance in instances: @@ -47,5 +49,4 @@ async def measure(self, instances, context_provider): self.score = results["results"]["prompts"][0]["metrics"]["score"] - self.success = self.score >= self.threshold return self.score diff --git a/evals/promptfoo_wrapper.py b/evals/promptfoo_wrapper.py index 0ed2d4850..32be242e5 100644 --- a/evals/promptfoo_wrapper.py +++ b/evals/promptfoo_wrapper.py @@ -133,6 +133,9 @@ def run_eval( filename = "benchmark_results" filename = os.path.join(os.getcwd(), f"{filename}.json") + # Create an empty JSON file + with open(filename, "w") as file: + json.dump({}, file) cmd = [self.promptfoo_path, "eval"] if prompt_file: diff --git a/evals/promptfooprompt.json b/evals/promptfooprompt.json index 06d5ebc98..fb6351406 100644 --- a/evals/promptfooprompt.json +++ b/evals/promptfooprompt.json @@ -1,7 +1,7 @@ [ { "role": "system", - "content": "Answer the question using the provided context. Be as brief as possible. Each entry in the context is tuple of length 3, representing an edge of a knowledge graph with its two nodes.." + "content": "Answer the question using the provided context. Be as brief as possible." 
}, { "role": "user", diff --git a/evals/qa_metrics_utils.py b/evals/qa_metrics_utils.py index 8d12cfe4a..107fe429d 100644 --- a/evals/qa_metrics_utils.py +++ b/evals/qa_metrics_utils.py @@ -3,12 +3,14 @@ comprehensiveness_metric, diversity_metric, empowerment_metric, + directness_metric, f1_score_metric, em_score_metric, ) -from evals.promptfoo_metrics import PromptfooComprehensiveness +from evals.promptfoo_metrics import PromptfooMetric from deepeval.metrics import AnswerRelevancyMetric import deepeval.metrics +from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts native_deepeval_metrics = {"AnswerRelevancy": AnswerRelevancyMetric} @@ -17,12 +19,17 @@ "Comprehensiveness": comprehensiveness_metric, "Diversity": diversity_metric, "Empowerment": empowerment_metric, + "Directness": directness_metric, "F1": f1_score_metric, "EM": em_score_metric, } promptfoo_metrics = { - "promptfoo.comprehensiveness": PromptfooComprehensiveness, + "promptfoo.correctness": PromptfooMetric(llm_judge_prompts["correctness"]), + "promptfoo.comprehensiveness": PromptfooMetric(llm_judge_prompts["comprehensiveness"]), + "promptfoo.diversity": PromptfooMetric(llm_judge_prompts["diversity"]), + "promptfoo.empowerment": PromptfooMetric(llm_judge_prompts["empowerment"]), + "promptfoo.directness": PromptfooMetric(llm_judge_prompts["directness"]), } qa_metrics = native_deepeval_metrics | custom_deepeval_metrics | promptfoo_metrics From 273b16c0a8f65dcb202e8cb2d5fe5266bf52ebd0 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 09:38:41 +0100 Subject: [PATCH 08/15] Use requests.get instead of wget --- evals/qa_dataset_utils.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/evals/qa_dataset_utils.py b/evals/qa_dataset_utils.py index bb6dd5bbb..c570455c4 100644 --- a/evals/qa_dataset_utils.py +++ b/evals/qa_dataset_utils.py @@ -1,6 +1,6 @@ from cognee.root_dir import get_absolute_path import json -import wget +import requests from jsonschema import ValidationError, validate from pathlib import Path @@ -31,7 +31,7 @@ } -def download_qa_dataset(dataset_name: str, dir: str): +def download_qa_dataset(dataset_name: str, filepath: Path): if dataset_name not in qa_datasets: raise ValueError(f"{dataset_name} is not a supported dataset.") @@ -44,7 +44,15 @@ def download_qa_dataset(dataset_name: str, dir: str): and unzip it." ) - wget.download(url, out=dir) + response = requests.get(url, stream=True) + + if response.status_code == 200: + with open(filepath, "wb") as file: + for chunk in response.iter_content(chunk_size=8192): + file.write(chunk) + print(f"Dataset {dataset_name} downloaded and saved to {filepath}") + else: + print(f"Failed to download {dataset_name}. 
Status code: {response.status_code}") def load_qa_dataset(dataset_name_or_filename: str): @@ -58,7 +66,7 @@ def load_qa_dataset(dataset_name_or_filename: str): filepath = data_root_dir / Path(filename) if not filepath.exists(): - download_qa_dataset(dataset_name, data_root_dir) + download_qa_dataset(dataset_name, filepath) else: filename = dataset_name_or_filename filepath = Path(filename) From 66d8850592c2a927718e5193dcc3728cf37c224f Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 09:59:28 +0100 Subject: [PATCH 09/15] clean up promptfoo config template --- evals/promptfoo_config_template.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/evals/promptfoo_config_template.yaml b/evals/promptfoo_config_template.yaml index b7dd32764..f2201fca2 100644 --- a/evals/promptfoo_config_template.yaml +++ b/evals/promptfoo_config_template.yaml @@ -5,9 +5,3 @@ description: "My eval" providers: - id: openai:gpt-4o-mini - -defaultTest: - assert: - # Comprehensiveness - - type: llm-rubric - value: Determine how much detail the answer provides to cover all the aspects and details of the question. From e4145168f0c95a4384c88d663047b99d027cea62 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 10:12:50 +0100 Subject: [PATCH 10/15] minor fixes --- evals/eval_on_hotpot.py | 4 ++-- evals/promptfoo_wrapper.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index 27d9b554f..cfc29dcc1 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -37,7 +37,7 @@ async def get_context_without_cognee(instance): async def answer_qa_instance(instance, context_provider): - context = context_provider(instance) + context = await context_provider(instance) args = { "question": instance["question"], @@ -98,7 +98,7 @@ async def eval_on_QA_dataset( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--dataset", type=str, help="Which dataset to evaluate on") + parser.add_argument("--dataset", type=str, required=True, help="Which dataset to evaluate on") parser.add_argument("--with_cognee", action="store_true") parser.add_argument("--num_samples", type=int, default=500) parser.add_argument("--metric_name", type=str, default="Correctness") diff --git a/evals/promptfoo_wrapper.py b/evals/promptfoo_wrapper.py index 32be242e5..97a03bbf8 100644 --- a/evals/promptfoo_wrapper.py +++ b/evals/promptfoo_wrapper.py @@ -114,7 +114,7 @@ def run_eval( out_format: str = "json", extra_args: Optional[List[str]] = None, binary_output: bool = False, - ) -> List[Dict]: + ) -> Dict: """ Run the `promptfoo eval` command with the provided parameters and return parsed results. 
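To make the dataset plumbing above concrete, here is a minimal usage sketch (not part of the patch series). It assumes the repository root is on PYTHONPATH and, for the named dataset, network access to the HotpotQA URL; `my_local_qa.json` is a hypothetical file that follows `qa_json_schema`.

```python
# Minimal sketch of the qa_dataset_utils flow (assumptions: repo root on PYTHONPATH,
# network access for the HotpotQA download; "my_local_qa.json" is a hypothetical file).
from evals.qa_dataset_utils import load_qa_dataset

# A known dataset name is resolved via qa_datasets, downloaded with requests if the
# file is missing, and validated against qa_json_schema before being returned.
hotpot_instances = load_qa_dataset("hotpotqa")
print(hotpot_instances[0]["question"], "->", hotpot_instances[0]["answer"])

# Any other argument is treated as a path to a local JSON file with the same
# [{"question": ..., "answer": ..., "context": [...]}, ...] structure.
local_instances = load_qa_dataset("my_local_qa.json")
```

Note that `2wikimultihop` still has to be fetched and unzipped manually; `download_qa_dataset` only automates the HotpotQA download.
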
From c95dbb8d4aaee2b08a04ddd3036b940b7b594c0b Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 10:33:03 +0100 Subject: [PATCH 11/15] get promptfoo path instead of hardcoding --- evals/promptfoo_metrics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/evals/promptfoo_metrics.py b/evals/promptfoo_metrics.py index 8c1169d00..addd0030a 100644 --- a/evals/promptfoo_metrics.py +++ b/evals/promptfoo_metrics.py @@ -1,13 +1,14 @@ from evals.promptfoo_wrapper import PromptfooWrapper import os -from deepeval.test_case import LLMTestCase import yaml import json +import shutil class PromptfooMetric: def __init__(self, judge_prompt): - self.wrapper = PromptfooWrapper(promptfoo_path="/opt/homebrew/bin/promptfoo") + promptfoo_path = shutil.which("promptfoo") + self.wrapper = PromptfooWrapper(promptfoo_path=promptfoo_path) self.judge_prompt = judge_prompt async def measure(self, instances, context_provider): From 14cac1b9751af02b90d3d43ec6d0bf781b8089bd Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 10:45:50 +0100 Subject: [PATCH 12/15] minor fixes --- evals/qa_dataset_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/qa_dataset_utils.py b/evals/qa_dataset_utils.py index c570455c4..ac97a180c 100644 --- a/evals/qa_dataset_utils.py +++ b/evals/qa_dataset_utils.py @@ -55,7 +55,7 @@ def download_qa_dataset(dataset_name: str, filepath: Path): print(f"Failed to download {dataset_name}. Status code: {response.status_code}") -def load_qa_dataset(dataset_name_or_filename: str): +def load_qa_dataset(dataset_name_or_filename: str) -> list[dict]: if dataset_name_or_filename in qa_datasets: dataset_name = dataset_name_or_filename filename = qa_datasets[dataset_name]["filename"] @@ -77,6 +77,6 @@ def load_qa_dataset(dataset_name_or_filename: str): try: validate(instance=dataset, schema=qa_json_schema) except ValidationError as e: - print("File is not a valid QA dataset:", e.message) + raise ValidationError(f"Invalid QA dataset: {e.message}") return dataset From 51d460713d1fe64942b6865efdbe08420b3cccb6 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 10:50:19 +0100 Subject: [PATCH 13/15] Add LLM as a judge prompts --- cognee/infrastructure/llm/prompts/llm_judge_prompts.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 cognee/infrastructure/llm/prompts/llm_judge_prompts.py diff --git a/cognee/infrastructure/llm/prompts/llm_judge_prompts.py b/cognee/infrastructure/llm/prompts/llm_judge_prompts.py new file mode 100644 index 000000000..9b94ebdad --- /dev/null +++ b/cognee/infrastructure/llm/prompts/llm_judge_prompts.py @@ -0,0 +1,9 @@ +# LLM-as-a-judge metrics as described here: https://arxiv.org/abs/2404.16130 + +llm_judge_prompts = { + "correctness": "Determine whether the actual output is factually correct based on the expected output.", + "comprehensiveness": "Determine how much detail the answer provides to cover all the aspects and details of the question.", + "diversity": "Determine how varied and rich the answer is in providing different perspectives and insights on the question.", + "empowerment": "Determine how well the answer helps the reader understand and make informed judgements about the topic.", + "directness": "Determine how specifically and clearly the answer addresses the question.", +} From d20ecd09f78734029ecaa557476634111111bf61 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 14:39:44 +0100 Subject: [PATCH 14/15] Support 4 different rag options 
in eval --- evals/eval_on_hotpot.py | 46 ++++++---------------- evals/qa_context_provider_utils.py | 61 ++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 34 deletions(-) create mode 100644 evals/qa_context_provider_utils.py diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index cfc29dcc1..ff7580b04 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -5,35 +5,11 @@ from deepeval.test_case import LLMTestCase from tqdm import tqdm -import cognee -from cognee.api.v1.search import SearchType from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt from evals.qa_dataset_utils import load_qa_dataset from evals.qa_metrics_utils import get_metric - - -async def get_context_with_cognee(instance): - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - - for title, sentences in instance["context"]: - await cognee.add("\n".join(sentences), dataset_name="QA") - await cognee.cognify("QA") - - search_results = await cognee.search(SearchType.INSIGHTS, query_text=instance["question"]) - search_results_second = await cognee.search( - SearchType.SUMMARIES, query_text=instance["question"] - ) - search_results = search_results + search_results_second - - search_results_str = "\n".join([context_item["text"] for context_item in search_results]) - - return search_results_str - - -async def get_context_without_cognee(instance): - return instance["context"] +from evals.qa_context_provider_utils import qa_context_providers async def answer_qa_instance(instance, context_provider): @@ -72,11 +48,12 @@ async def eval_answers(instances, answers, eval_metric): async def eval_on_QA_dataset( - dataset_name_or_filename: str, context_provider, num_samples, eval_metric_name + dataset_name_or_filename: str, context_provider_name, num_samples, eval_metric_name ): dataset = load_qa_dataset(dataset_name_or_filename) - + context_provider = qa_context_providers[context_provider_name] eval_metric = get_metric(eval_metric_name) + instances = dataset if not num_samples else dataset[:num_samples] if eval_metric_name.startswith("promptfoo"): @@ -99,18 +76,19 @@ async def eval_on_QA_dataset( parser = argparse.ArgumentParser() parser.add_argument("--dataset", type=str, required=True, help="Which dataset to evaluate on") - parser.add_argument("--with_cognee", action="store_true") + parser.add_argument( + "--rag_option", + type=str, + choices=qa_context_providers.keys(), + required=True, + help="RAG option to use for providing context", + ) parser.add_argument("--num_samples", type=int, default=500) parser.add_argument("--metric_name", type=str, default="Correctness") args = parser.parse_args() - if args.with_cognee: - context_provider = get_context_with_cognee - else: - context_provider = get_context_without_cognee - avg_score = asyncio.run( - eval_on_QA_dataset(args.dataset, context_provider, args.num_samples, args.metric_name) + eval_on_QA_dataset(args.dataset, args.rag_option, args.num_samples, args.metric_name) ) print(f"Average {args.metric_name}: {avg_score}") diff --git a/evals/qa_context_provider_utils.py b/evals/qa_context_provider_utils.py new file mode 100644 index 000000000..52d63a4e3 --- /dev/null +++ b/evals/qa_context_provider_utils.py @@ -0,0 +1,61 @@ +import cognee +from cognee.api.v1.search import SearchType +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.modules.retrieval.brute_force_triplet_search import 
brute_force_triplet_search +from cognee.tasks.completion.graph_query_completion import retrieved_edges_to_string + + +async def get_context_without_rag(instance: dict) -> str: + return instance["context"] + + +async def cognify_instance(instance: dict): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + for title, sentences in instance["context"]: + await cognee.add("\n".join(sentences), dataset_name="QA") + await cognee.cognify("QA") + + +async def get_context_with_cognee(instance: dict) -> str: + await cognify_instance(instance) + + search_results = await cognee.search(SearchType.INSIGHTS, query_text=instance["question"]) + search_results_second = await cognee.search( + SearchType.SUMMARIES, query_text=instance["question"] + ) + search_results = search_results + search_results_second + + search_results_str = "\n".join([context_item["text"] for context_item in search_results]) + + return search_results_str + + +async def get_context_with_simple_rag(instance: dict) -> str: + await cognify_instance(instance) + + vector_engine = get_vector_engine() + found_chunks = await vector_engine.search("document_chunk_text", instance["question"], limit=5) + + search_results_str = "\n".join([context_item.payload["text"] for context_item in found_chunks]) + + return search_results_str + + +async def get_context_with_brute_force_triplet_search(instance: dict) -> str: + await cognify_instance(instance) + + found_triplets = await brute_force_triplet_search(instance["question"], top_k=5) + + search_results_str = retrieved_edges_to_string(found_triplets) + + return search_results_str + + +qa_context_providers = { + "no_rag": get_context_without_rag, + "cognee": get_context_with_cognee, + "simple_rag": get_context_with_simple_rag, + "brute_force": get_context_with_brute_force_triplet_search, +} From 9c10303e9e2244b00f3b324aed31c6c247835ba2 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 18:31:37 +0100 Subject: [PATCH 15/15] Minor refactor and logger usage --- evals/eval_on_hotpot.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index cfc29dcc1..54dcaffd0 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -4,7 +4,7 @@ from deepeval.dataset import EvaluationDataset from deepeval.test_case import LLMTestCase from tqdm import tqdm - +import logging import cognee from cognee.api.v1.search import SearchType from cognee.infrastructure.llm.get_llm_client import get_llm_client @@ -12,6 +12,8 @@ from evals.qa_dataset_utils import load_qa_dataset from evals.qa_metrics_utils import get_metric +logger = logging.getLogger(__name__) + async def get_context_with_cognee(instance): await cognee.prune.prune_data() @@ -56,7 +58,7 @@ async def answer_qa_instance(instance, context_provider): return answer_prediction -async def eval_answers(instances, answers, eval_metric): +async def deepeval_answers(instances, answers, eval_metric): test_cases = [] for instance, answer in zip(instances, answers): @@ -71,23 +73,13 @@ async def eval_answers(instances, answers, eval_metric): return eval_results -async def eval_on_QA_dataset( - dataset_name_or_filename: str, context_provider, num_samples, eval_metric_name -): - dataset = load_qa_dataset(dataset_name_or_filename) - - eval_metric = get_metric(eval_metric_name) - instances = dataset if not num_samples else dataset[:num_samples] - - if eval_metric_name.startswith("promptfoo"): - return await 
eval_metric.measure(instances, context_provider) - +async def deepeval_on_instances(instances, context_provider, eval_metric): answers = [] for instance in tqdm(instances, desc="Getting answers"): answer = await answer_qa_instance(instance, context_provider) answers.append(answer) - eval_results = await eval_answers(instances, answers, eval_metric) + eval_results = await deepeval_answers(instances, answers, eval_metric) avg_score = statistics.mean( [result.metrics_data[0].score for result in eval_results.test_results] ) @@ -95,6 +87,20 @@ async def eval_on_QA_dataset( return avg_score +async def eval_on_QA_dataset( + dataset_name_or_filename: str, context_provider, num_samples, eval_metric_name +): + dataset = load_qa_dataset(dataset_name_or_filename) + + eval_metric = get_metric(eval_metric_name) + instances = dataset if not num_samples else dataset[:num_samples] + + if eval_metric_name.startswith("promptfoo"): + return await eval_metric.measure(instances, context_provider) + else: + return await deepeval_on_instances(instances, context_provider, eval_metric) + + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -113,4 +119,4 @@ async def eval_on_QA_dataset( avg_score = asyncio.run( eval_on_QA_dataset(args.dataset, context_provider, args.num_samples, args.metric_name) ) - print(f"Average {args.metric_name}: {avg_score}") + logger.info(f"Average {args.metric_name}: {avg_score}")
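
Taken together, the series turns evals/eval_on_hotpot.py into a thin driver over three pluggable pieces: a dataset loader, a context provider, and a judge metric. The sketch below is not part of the patches; it assumes the evals package is importable, that cognee is configured with an LLM backend for the retrieval-based providers, and that the instance is a made-up example in the HotpotQA format. The CLI added in patch 14 would be invoked along the lines of `python evals/eval_on_hotpot.py --dataset hotpotqa --rag_option brute_force --num_samples 5 --metric_name Comprehensiveness`.

```python
# Illustrative composition of the pieces added in this series (assumptions: evals/ is
# importable, cognee has an LLM backend configured for the non-"no_rag" providers, and
# the instance below is an illustrative example following the QA schema).
import asyncio

from evals.qa_context_provider_utils import qa_context_providers
from evals.qa_metrics_utils import get_metric

instance = {
    "question": "Which magazine was started first, Arthur's Magazine or First for Women?",
    "answer": "Arthur's Magazine",
    "context": [
        ["Arthur's Magazine", ["Arthur's Magazine (1844-1846) was an American literary periodical."]],
        ["First for Women", ["First for Women is a woman's magazine launched in 1989."]],
    ],
}

context_provider = qa_context_providers["no_rag"]  # or "cognee", "simple_rag", "brute_force"
metric = get_metric("Comprehensiveness")           # GEval judge defined in deepeval_metrics.py

context = asyncio.run(context_provider(instance))
print(context)
```

From here, `answer_qa_instance` renders the prompt with this context and `deepeval_on_instances` wraps the answers in `LLMTestCase` objects scored by the chosen metric.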