From a67512d742216012bc38b67ce0bb9cc5f3341365 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Wed, 8 Jan 2025 12:16:34 +0100 Subject: [PATCH 01/15] QA eval dataset as argument, with hotpot and 2wikimultihop as options. Json schema validation for datasets. --- evals/eval_on_hotpot.py | 78 ++++++++++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 13 deletions(-) diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index e07e80e0c..40f4fe9d3 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -8,15 +8,68 @@ import wget from deepeval.dataset import EvaluationDataset from deepeval.test_case import LLMTestCase +from jsonschema import ValidationError, validate from tqdm import tqdm import cognee import evals.deepeval_metrics from cognee.api.v1.search import SearchType -from cognee.base_config import get_base_config from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt +from cognee.root_dir import get_absolute_path +qa_datasets = { + "hotpotqa": { + "filename": "hotpot_dev_fullwiki_v1.json", + "URL": "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json" + }, + "2wikimultihop": { + "filename": "data/dev.json", + "URL": "https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1" + } +} + +qa_json_schema = { + "type": "array", + "items": { + "type": "object", + "properties": { + "answer": {"type": "string"}, + "question": {"type": "string"}, + "context": {"type": "array"}, + }, + "required": ["answer", "question", "context"], + "additionalProperties": True + } +} + + +def download_qa_dataset(dataset_name: str, dir: str): + + if dataset_name not in qa_datasets: + raise ValueError(f"{dataset_name} is not a supported dataset.") + + url = qa_datasets[dataset_name]["URL"] + + if dataset_name == "2wikimultihop": + raise Exception("Please download 2wikimultihop dataset (data.zip) manually from \ + https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1 \ + and unzip it.") + + wget.download(url, out=dir) + + +def load_qa_dataset(filepath: Path): + + with open(filepath, "r") as file: + dataset = json.load(file) + + try: + validate(instance=dataset, schema=qa_json_schema) + except ValidationError as e: + print("File is not a valid QA dataset:", e.message) + + return dataset async def answer_without_cognee(instance): args = { @@ -39,9 +92,8 @@ async def answer_with_cognee(instance): await cognee.prune.prune_system(metadata=True) for (title, sentences) in instance["context"]: - await cognee.add("\n".join(sentences), dataset_name = "HotPotQA") - - await cognee.cognify("HotPotQA") + await cognee.add("\n".join(sentences), dataset_name = "QA") + await cognee.cognify("QA") search_results = await cognee.search( SearchType.INSIGHTS, query_text=instance["question"] @@ -80,20 +132,19 @@ async def eval_answers(instances, answers, eval_metric): return eval_results -async def eval_on_hotpotQA(answer_provider, num_samples, eval_metric): - base_config = get_base_config() - data_root_dir = base_config.data_root_directory +async def eval_on_QA_dataset(dataset_name: str, answer_provider, num_samples, eval_metric): + + data_root_dir = get_absolute_path("../.data") if not Path(data_root_dir).exists(): Path(data_root_dir).mkdir() - filepath = data_root_dir / Path("hotpot_dev_fullwiki_v1.json") + filename = qa_datasets[dataset_name]["filename"] + filepath = data_root_dir / Path(filename) if not 
filepath.exists(): - url = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json' - wget.download(url, out=data_root_dir) + download_qa_dataset(dataset_name, data_root_dir) - with open(filepath, "r") as file: - dataset = json.load(file) + dataset = load_qa_dataset(filepath) instances = dataset if not num_samples else dataset[:num_samples] answers = [] @@ -109,6 +160,7 @@ async def eval_on_hotpotQA(answer_provider, num_samples, eval_metric): if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument("--dataset", type=str, choices=list(qa_datasets.keys()), help="Which dataset to evaluate on") parser.add_argument("--with_cognee", action="store_true") parser.add_argument("--num_samples", type=int, default=500) parser.add_argument("--metric", type=str, default="correctness_metric", @@ -130,5 +182,5 @@ async def eval_on_hotpotQA(answer_provider, num_samples, eval_metric): else: answer_provider = answer_without_cognee - avg_score = asyncio.run(eval_on_hotpotQA(answer_provider, args.num_samples, metric)) + avg_score = asyncio.run(eval_on_QA_dataset(args.dataset, answer_provider, args.num_samples, metric)) print(f"Average {args.metric}: {avg_score}") \ No newline at end of file From e0a8c19172f606128123f6936105b94db7e0e706 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 9 Jan 2025 15:32:22 +0100 Subject: [PATCH 02/15] Load dataset file by filename, outsource utilities --- evals/eval_on_hotpot.py | 80 +++------------------------------------ evals/qa_dataset_utils.py | 74 ++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 74 deletions(-) create mode 100644 evals/qa_dataset_utils.py diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index 0558edc99..ee2435e6b 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -1,14 +1,9 @@ import argparse import asyncio -import json import statistics -from pathlib import Path - import deepeval.metrics -import wget from deepeval.dataset import EvaluationDataset from deepeval.test_case import LLMTestCase -from jsonschema import ValidationError, validate from tqdm import tqdm import cognee @@ -16,60 +11,7 @@ from cognee.api.v1.search import SearchType from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt -from cognee.root_dir import get_absolute_path - -qa_datasets = { - "hotpotqa": { - "filename": "hotpot_dev_fullwiki_v1.json", - "URL": "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json", - }, - "2wikimultihop": { - "filename": "data/dev.json", - "URL": "https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1", - }, -} - -qa_json_schema = { - "type": "array", - "items": { - "type": "object", - "properties": { - "answer": {"type": "string"}, - "question": {"type": "string"}, - "context": {"type": "array"}, - }, - "required": ["answer", "question", "context"], - "additionalProperties": True, - }, -} - - -def download_qa_dataset(dataset_name: str, dir: str): - if dataset_name not in qa_datasets: - raise ValueError(f"{dataset_name} is not a supported dataset.") - - url = qa_datasets[dataset_name]["URL"] - - if dataset_name == "2wikimultihop": - raise Exception( - "Please download 2wikimultihop dataset (data.zip) manually from \ - https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1 \ - and unzip it." 
- ) - - wget.download(url, out=dir) - - -def load_qa_dataset(filepath: Path): - with open(filepath, "r") as file: - dataset = json.load(file) - - try: - validate(instance=dataset, schema=qa_json_schema) - except ValidationError as e: - print("File is not a valid QA dataset:", e.message) - - return dataset +from evals.qa_dataset_utils import load_qa_dataset async def answer_without_cognee(instance): @@ -135,18 +77,10 @@ async def eval_answers(instances, answers, eval_metric): return eval_results -async def eval_on_QA_dataset(dataset_name: str, answer_provider, num_samples, eval_metric): - data_root_dir = get_absolute_path("../.data") - - if not Path(data_root_dir).exists(): - Path(data_root_dir).mkdir() - - filename = qa_datasets[dataset_name]["filename"] - filepath = data_root_dir / Path(filename) - if not filepath.exists(): - download_qa_dataset(dataset_name, data_root_dir) - - dataset = load_qa_dataset(filepath) +async def eval_on_QA_dataset( + dataset_name_or_filename: str, answer_provider, num_samples, eval_metric +): + dataset = load_qa_dataset(dataset_name_or_filename) instances = dataset if not num_samples else dataset[:num_samples] answers = [] @@ -165,9 +99,7 @@ async def eval_on_QA_dataset(dataset_name: str, answer_provider, num_samples, ev if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--dataset", type=str, choices=list(qa_datasets.keys()), help="Which dataset to evaluate on" - ) + parser.add_argument("--dataset", type=str, help="Which dataset to evaluate on") parser.add_argument("--with_cognee", action="store_true") parser.add_argument("--num_samples", type=int, default=500) parser.add_argument( diff --git a/evals/qa_dataset_utils.py b/evals/qa_dataset_utils.py new file mode 100644 index 000000000..bb6dd5bbb --- /dev/null +++ b/evals/qa_dataset_utils.py @@ -0,0 +1,74 @@ +from cognee.root_dir import get_absolute_path +import json +import wget +from jsonschema import ValidationError, validate +from pathlib import Path + + +qa_datasets = { + "hotpotqa": { + "filename": "hotpot_dev_fullwiki_v1.json", + "URL": "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json", + }, + "2wikimultihop": { + "filename": "data/dev.json", + "URL": "https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1", + }, +} + +qa_json_schema = { + "type": "array", + "items": { + "type": "object", + "properties": { + "answer": {"type": "string"}, + "question": {"type": "string"}, + "context": {"type": "array"}, + }, + "required": ["answer", "question", "context"], + "additionalProperties": True, + }, +} + + +def download_qa_dataset(dataset_name: str, dir: str): + if dataset_name not in qa_datasets: + raise ValueError(f"{dataset_name} is not a supported dataset.") + + url = qa_datasets[dataset_name]["URL"] + + if dataset_name == "2wikimultihop": + raise Exception( + "Please download 2wikimultihop dataset (data.zip) manually from \ + https://www.dropbox.com/scl/fi/heid2pkiswhfaqr5g0piw/data.zip?rlkey=ira57daau8lxfj022xvk1irju&e=1 \ + and unzip it." 
+ ) + + wget.download(url, out=dir) + + +def load_qa_dataset(dataset_name_or_filename: str): + if dataset_name_or_filename in qa_datasets: + dataset_name = dataset_name_or_filename + filename = qa_datasets[dataset_name]["filename"] + + data_root_dir = get_absolute_path("../.data") + if not Path(data_root_dir).exists(): + Path(data_root_dir).mkdir() + + filepath = data_root_dir / Path(filename) + if not filepath.exists(): + download_qa_dataset(dataset_name, data_root_dir) + else: + filename = dataset_name_or_filename + filepath = Path(filename) + + with open(filepath, "r") as file: + dataset = json.load(file) + + try: + validate(instance=dataset, schema=qa_json_schema) + except ValidationError as e: + print("File is not a valid QA dataset:", e.message) + + return dataset From 49fb0535036d97f3c9fa50fbfa03b2bdd368a743 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Sat, 11 Jan 2025 13:48:50 +0100 Subject: [PATCH 03/15] restructure metric selection --- evals/eval_on_hotpot.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index ee2435e6b..6c924c84c 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -12,6 +12,7 @@ from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt from evals.qa_dataset_utils import load_qa_dataset +from evals.qa_metrics_utils import get_metric async def answer_without_cognee(instance): @@ -78,9 +79,10 @@ async def eval_answers(instances, answers, eval_metric): async def eval_on_QA_dataset( - dataset_name_or_filename: str, answer_provider, num_samples, eval_metric + dataset_name_or_filename: str, answer_provider, num_samples, eval_metric_name ): dataset = load_qa_dataset(dataset_name_or_filename) + eval_metric = get_metric(eval_metric_name) instances = dataset if not num_samples else dataset[:num_samples] answers = [] @@ -102,30 +104,16 @@ async def eval_on_QA_dataset( parser.add_argument("--dataset", type=str, help="Which dataset to evaluate on") parser.add_argument("--with_cognee", action="store_true") parser.add_argument("--num_samples", type=int, default=500) - parser.add_argument( - "--metric", - type=str, - default="correctness_metric", - help="Valid options are Deepeval metrics (e.g. AnswerRelevancyMetric) \ - and metrics defined in evals/deepeval_metrics.py, e.g. 
f1_score_metric", - ) + parser.add_argument("--metric_name", type=str, default="Correctness") args = parser.parse_args() - try: - metric_cls = getattr(deepeval.metrics, args.metric) - metric = metric_cls() - except AttributeError: - metric = getattr(evals.deepeval_metrics, args.metric) - if isinstance(metric, type): - metric = metric() - if args.with_cognee: answer_provider = answer_with_cognee else: answer_provider = answer_without_cognee avg_score = asyncio.run( - eval_on_QA_dataset(args.dataset, answer_provider, args.num_samples, metric) + eval_on_QA_dataset(args.dataset, answer_provider, args.num_samples, args.metric_name) ) print(f"Average {args.metric}: {avg_score}") From 13422ba4aef382b0d14941109e3a0d5960135104 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Sat, 11 Jan 2025 14:24:07 +0100 Subject: [PATCH 04/15] Add comprehensiveness, diversity and empowerment metrics --- evals/deepeval_metrics.py | 39 +++++++++++++++++++++++++++++++++++++++ evals/eval_on_hotpot.py | 2 +- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/evals/deepeval_metrics.py b/evals/deepeval_metrics.py index 9ce1e9e4f..6ef8e822f 100644 --- a/evals/deepeval_metrics.py +++ b/evals/deepeval_metrics.py @@ -12,6 +12,45 @@ ], ) +comprehensiveness_metric = GEval( + name="Comprehensiveness", + model="gpt-4o-mini", + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + evaluation_steps=[ + "Determine how much detail the answer provides to cover all the aspects and details of the question." + ], +) + +diversity_metric = GEval( + name="Diversity", + model="gpt-4o-mini", + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + evaluation_steps=[ + "Determine how varied and rich the answer is in providing different perspectives and insights on the question." + ], +) + +empowerment_metric = GEval( + name="Empowerment", + model="gpt-4o-mini", + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + evaluation_steps=[ + "Determine how well the answer helps the reader understand and make informed judgements about the topic." 
+ ], +) + class f1_score_metric(BaseMetric): """F1 score taken directly from the official hotpot benchmark diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index 6c924c84c..20a965106 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -116,4 +116,4 @@ async def eval_on_QA_dataset( avg_score = asyncio.run( eval_on_QA_dataset(args.dataset, answer_provider, args.num_samples, args.metric_name) ) - print(f"Average {args.metric}: {avg_score}") + print(f"Average {args.metric_name}: {avg_score}") From d57609db4b59492519c0df7a799bd0e216fe5056 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Mon, 13 Jan 2025 10:37:19 +0100 Subject: [PATCH 05/15] add promptfoo as an option --- evals/eval_on_hotpot.py | 14 +++- evals/promptfoo_metrics.py | 53 +++++++++++++ evals/promptfoo_wrapper.py | 154 +++++++++++++++++++++++++++++++++++++ evals/promptfooconfig.yaml | 13 ++++ evals/promptfooprompt.json | 10 +++ evals/qa_metrics_utils.py | 44 +++++++++++ 6 files changed, 284 insertions(+), 4 deletions(-) create mode 100644 evals/promptfoo_metrics.py create mode 100644 evals/promptfoo_wrapper.py create mode 100644 evals/promptfooconfig.yaml create mode 100644 evals/promptfooprompt.json create mode 100644 evals/qa_metrics_utils.py diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index 20a965106..d4af283e2 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -1,13 +1,11 @@ import argparse import asyncio import statistics -import deepeval.metrics from deepeval.dataset import EvaluationDataset from deepeval.test_case import LLMTestCase from tqdm import tqdm import cognee -import evals.deepeval_metrics from cognee.api.v1.search import SearchType from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt @@ -32,7 +30,7 @@ async def answer_without_cognee(instance): return answer_prediction -async def answer_with_cognee(instance): +async def get_context_with_cognee(instance): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) @@ -45,7 +43,11 @@ async def answer_with_cognee(instance): SearchType.SUMMARIES, query_text=instance["question"] ) search_results = search_results + search_results_second + return search_results + +async def answer_with_cognee(instance): + search_results = get_context_with_cognee(instance) args = { "question": instance["question"], "context": search_results, @@ -82,9 +84,13 @@ async def eval_on_QA_dataset( dataset_name_or_filename: str, answer_provider, num_samples, eval_metric_name ): dataset = load_qa_dataset(dataset_name_or_filename) - eval_metric = get_metric(eval_metric_name) + eval_metric = get_metric(eval_metric_name) instances = dataset if not num_samples else dataset[:num_samples] + + if eval_metric_name.startswith("promptfoo"): + return await eval_metric.measure(instances) + answers = [] for instance in tqdm(instances, desc="Getting answers"): answer = await answer_provider(instance) diff --git a/evals/promptfoo_metrics.py b/evals/promptfoo_metrics.py new file mode 100644 index 000000000..1806c105b --- /dev/null +++ b/evals/promptfoo_metrics.py @@ -0,0 +1,53 @@ +from evals.promptfoo_wrapper import PromptfooWrapper +import os +from deepeval.test_case import LLMTestCase +import yaml +import json + + +class PromptfooComprehensiveness: + def __init__(self, threshold: float = 0.5): + self.wrapper = PromptfooWrapper(promptfoo_path="/opt/homebrew/bin/promptfoo") + self.threshold = threshold + + async def measure(self, 
instances): + with open(os.path.join(os.getcwd(), "evals/promptfooconfig.yaml"), "r") as file: + config = yaml.safe_load(file) + + # creating config file + tests = [] + for instance in instances: + from evals.eval_on_hotpot import get_context_with_cognee + + context = await get_context_with_cognee(instance) + test = { + "vars": { + "name": instance["question"][:15], + "question": instance["question"], + "context": str(context), + } + } + tests.append(test) + config["tests"] = tests + + # Write the updated YAML back, preserving formatting and structure + updated_yaml_file_path = os.path.join(os.getcwd(), "config_with_context.yaml") + with open(updated_yaml_file_path, "w") as file: + yaml.dump(config, file) + + self.wrapper.run_eval( + prompt_file=os.path.join(os.getcwd(), "evals/promptfooprompt.json"), + config_file=os.path.join(os.getcwd(), "config_with_context.yaml"), + out_format="json", + ) + + file_path = os.path.join(os.getcwd(), "benchmark_results.json") + + # Read and parse the JSON file + with open(file_path, "r") as file: + results = json.load(file) + + self.score = results["results"]["prompts"][0]["metrics"]["score"] + + self.success = self.score >= self.threshold + return self.score diff --git a/evals/promptfoo_wrapper.py b/evals/promptfoo_wrapper.py new file mode 100644 index 000000000..0ed2d4850 --- /dev/null +++ b/evals/promptfoo_wrapper.py @@ -0,0 +1,154 @@ +import subprocess +import json +import logging +import os +from typing import List, Optional, Dict, Generator +import shutil +import platform +from dotenv import load_dotenv + +logger = logging.getLogger(__name__) + +# Load environment variables from .env file +load_dotenv() + + +class PromptfooWrapper: + """ + A Python wrapper class around the promptfoo CLI tool, allowing you to: + - Evaluate prompts against different language models. + - Compare responses from multiple models. + - Pass configuration and prompt files. + - Retrieve the outputs in a structured format, including binary output if needed. + + This class assumes you have the promptfoo CLI installed and accessible in your environment. + For more details on promptfoo, see: https://github.com/promptfoo/promptfoo + """ + + def __init__(self, promptfoo_path: str = ""): + """ + Initialize the wrapper with the path to the promptfoo executable. + + :param promptfoo_path: Path to the promptfoo binary (default: 'promptfoo') + """ + self.promptfoo_path = promptfoo_path + logger.debug(f"Initialized PromptfooWrapper with binary at: {self.promptfoo_path}") + + def _validate_path(self, file_path: Optional[str]) -> None: + """ + Validate that a file path is accessible if provided. + Raise FileNotFoundError if it does not exist. + """ + if file_path and not os.path.isfile(file_path): + logger.error(f"File not found: {file_path}") + raise FileNotFoundError(f"File not found: {file_path}") + + def _get_node_bin_dir(self) -> str: + """ + Determine the Node.js binary directory dynamically for macOS and Linux. 
+ """ + node_executable = shutil.which("node") + if not node_executable: + logger.error("Node.js is not installed or not found in the system PATH.") + raise EnvironmentError("Node.js is not installed or not in PATH.") + + # Determine the Node.js binary directory + node_bin_dir = os.path.dirname(node_executable) + + # Special handling for macOS, where Homebrew installs Node in /usr/local or /opt/homebrew + if platform.system() == "Darwin": # macOS + logger.debug("Running on macOS") + brew_prefix = os.popen("brew --prefix node").read().strip() + if brew_prefix and os.path.exists(brew_prefix): + node_bin_dir = os.path.join(brew_prefix, "bin") + logger.debug(f"Detected Node.js binary directory using Homebrew: {node_bin_dir}") + + # For Linux, Node.js installed via package managers should work out of the box + logger.debug(f"Detected Node.js binary directory: {node_bin_dir}") + return node_bin_dir + + def _run_command( + self, + cmd: List[str], + filename, + ) -> Generator[Dict, None, None]: + """ + Run a given command using subprocess and parse the output. + """ + logger.debug(f"Running command: {' '.join(cmd)}") + + # Make a copy of the current environment + env = os.environ.copy() + + try: + node_bin_dir = self._get_node_bin_dir() + print(node_bin_dir) + env["PATH"] = f"{node_bin_dir}:{env['PATH']}" + + except EnvironmentError as e: + logger.error(f"Failed to set Node.js binary directory: {e}") + raise + + # Add node's bin directory to the PATH + # node_bin_dir = "/Users/vasilije/Library/Application Support/JetBrains/PyCharm2024.2/node/versions/20.15.0/bin" + # # env["PATH"] = f"{node_bin_dir}:{env['PATH']}" + + result = subprocess.run(cmd, capture_output=True, text=True, check=False, env=env) + + print(result.stderr) + with open(filename, "r", encoding="utf-8") as file: + read_data = json.load(file) + print(f"{filename} created and written.") + + # Log raw stdout for debugging + logger.debug(f"Raw command output:\n{result.stdout}") + + # Use the parse_promptfoo_output function to yield parsed results + return read_data + + def run_eval( + self, + prompt_file: Optional[str] = None, + config_file: Optional[str] = None, + eval_file: Optional[str] = None, + out_format: str = "json", + extra_args: Optional[List[str]] = None, + binary_output: bool = False, + ) -> List[Dict]: + """ + Run the `promptfoo eval` command with the provided parameters and return parsed results. + + :param prompt_file: Path to a file containing one or more prompts. + :param config_file: Path to a config file specifying models, scoring methods, etc. + :param eval_file: Path to an eval file with test data. + :param out_format: Output format, e.g., 'json', 'yaml', or 'table'. + :param extra_args: Additional command-line arguments for fine-tuning evaluation. + :param binary_output: If True, interpret output as binary data instead of text. + :return: List of parsed results (each result is a dictionary). 
+ """ + self._validate_path(prompt_file) + self._validate_path(config_file) + self._validate_path(eval_file) + + filename = "benchmark_results" + + filename = os.path.join(os.getcwd(), f"{filename}.json") + + cmd = [self.promptfoo_path, "eval"] + if prompt_file: + cmd.extend(["--prompts", prompt_file]) + if config_file: + cmd.extend(["--config", config_file]) + if eval_file: + cmd.extend(["--eval", eval_file]) + cmd.extend(["--output", filename]) + if extra_args: + cmd.extend(extra_args) + + # Log the constructed command for debugging + logger.debug(f"Constructed command: {' '.join(cmd)}") + + # Collect results from the generator + results = self._run_command(cmd, filename=filename) + logger.debug(f"Parsed results: {json.dumps(results, indent=4)}") + return results diff --git a/evals/promptfooconfig.yaml b/evals/promptfooconfig.yaml new file mode 100644 index 000000000..b7dd32764 --- /dev/null +++ b/evals/promptfooconfig.yaml @@ -0,0 +1,13 @@ +# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json + +# Learn more about building a configuration: https://promptfoo.dev/docs/configuration/guide + +description: "My eval" +providers: + - id: openai:gpt-4o-mini + +defaultTest: + assert: + # Comprehensiveness + - type: llm-rubric + value: Determine how much detail the answer provides to cover all the aspects and details of the question. diff --git a/evals/promptfooprompt.json b/evals/promptfooprompt.json new file mode 100644 index 000000000..06d5ebc98 --- /dev/null +++ b/evals/promptfooprompt.json @@ -0,0 +1,10 @@ +[ + { + "role": "system", + "content": "Answer the question using the provided context. Be as brief as possible. Each entry in the context is tuple of length 3, representing an edge of a knowledge graph with its two nodes.." + }, + { + "role": "user", + "content": "The question is: `{{ question }}` \n And here is the context: `{{ context }}`" + } +] diff --git a/evals/qa_metrics_utils.py b/evals/qa_metrics_utils.py new file mode 100644 index 000000000..8d12cfe4a --- /dev/null +++ b/evals/qa_metrics_utils.py @@ -0,0 +1,44 @@ +from evals.deepeval_metrics import ( + correctness_metric, + comprehensiveness_metric, + diversity_metric, + empowerment_metric, + f1_score_metric, + em_score_metric, +) +from evals.promptfoo_metrics import PromptfooComprehensiveness +from deepeval.metrics import AnswerRelevancyMetric +import deepeval.metrics + +native_deepeval_metrics = {"AnswerRelevancy": AnswerRelevancyMetric} + +custom_deepeval_metrics = { + "Correctness": correctness_metric, + "Comprehensiveness": comprehensiveness_metric, + "Diversity": diversity_metric, + "Empowerment": empowerment_metric, + "F1": f1_score_metric, + "EM": em_score_metric, +} + +promptfoo_metrics = { + "promptfoo.comprehensiveness": PromptfooComprehensiveness, +} + +qa_metrics = native_deepeval_metrics | custom_deepeval_metrics | promptfoo_metrics + + +def get_metric(metric_name: str): + if metric_name in qa_metrics: + metric = qa_metrics[metric_name] + else: + try: + metric_cls = getattr(deepeval.metrics, metric_name) + metric = metric_cls() + except AttributeError: + raise Exception(f"Metric {metric_name} not supported") + + if isinstance(metric, type): + metric = metric() + + return metric From 8eedc2bb8d0413feb86feb6b8135619367531816 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Mon, 13 Jan 2025 14:19:58 +0100 Subject: [PATCH 06/15] refactor RAG solution in eval;2C --- evals/eval_on_hotpot.py | 45 ++++++++----------- ...ig.yaml => promptfoo_config_template.yaml} | 0 evals/promptfoo_metrics.py | 12 
+++-- 3 files changed, 23 insertions(+), 34 deletions(-) rename evals/{promptfooconfig.yaml => promptfoo_config_template.yaml} (100%) diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index d4af283e2..27d9b554f 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -13,23 +13,6 @@ from evals.qa_metrics_utils import get_metric -async def answer_without_cognee(instance): - args = { - "question": instance["question"], - "context": instance["context"], - } - user_prompt = render_prompt("context_for_question.txt", args) - system_prompt = read_query_prompt("answer_hotpot_question.txt") - - llm_client = get_llm_client() - answer_prediction = await llm_client.acreate_structured_output( - text_input=user_prompt, - system_prompt=system_prompt, - response_model=str, - ) - return answer_prediction - - async def get_context_with_cognee(instance): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) @@ -43,14 +26,22 @@ async def get_context_with_cognee(instance): SearchType.SUMMARIES, query_text=instance["question"] ) search_results = search_results + search_results_second - return search_results + search_results_str = "\n".join([context_item["text"] for context_item in search_results]) + + return search_results_str + + +async def get_context_without_cognee(instance): + return instance["context"] + + +async def answer_qa_instance(instance, context_provider): + context = context_provider(instance) -async def answer_with_cognee(instance): - search_results = get_context_with_cognee(instance) args = { "question": instance["question"], - "context": search_results, + "context": context, } user_prompt = render_prompt("context_for_question.txt", args) system_prompt = read_query_prompt("answer_hotpot_using_cognee_search.txt") @@ -81,7 +72,7 @@ async def eval_answers(instances, answers, eval_metric): async def eval_on_QA_dataset( - dataset_name_or_filename: str, answer_provider, num_samples, eval_metric_name + dataset_name_or_filename: str, context_provider, num_samples, eval_metric_name ): dataset = load_qa_dataset(dataset_name_or_filename) @@ -89,11 +80,11 @@ async def eval_on_QA_dataset( instances = dataset if not num_samples else dataset[:num_samples] if eval_metric_name.startswith("promptfoo"): - return await eval_metric.measure(instances) + return await eval_metric.measure(instances, context_provider) answers = [] for instance in tqdm(instances, desc="Getting answers"): - answer = await answer_provider(instance) + answer = await answer_qa_instance(instance, context_provider) answers.append(answer) eval_results = await eval_answers(instances, answers, eval_metric) @@ -115,11 +106,11 @@ async def eval_on_QA_dataset( args = parser.parse_args() if args.with_cognee: - answer_provider = answer_with_cognee + context_provider = get_context_with_cognee else: - answer_provider = answer_without_cognee + context_provider = get_context_without_cognee avg_score = asyncio.run( - eval_on_QA_dataset(args.dataset, answer_provider, args.num_samples, args.metric_name) + eval_on_QA_dataset(args.dataset, context_provider, args.num_samples, args.metric_name) ) print(f"Average {args.metric_name}: {avg_score}") diff --git a/evals/promptfooconfig.yaml b/evals/promptfoo_config_template.yaml similarity index 100% rename from evals/promptfooconfig.yaml rename to evals/promptfoo_config_template.yaml diff --git a/evals/promptfoo_metrics.py b/evals/promptfoo_metrics.py index 1806c105b..05a18b234 100644 --- a/evals/promptfoo_metrics.py +++ b/evals/promptfoo_metrics.py @@ -10,21 +10,19 
@@ def __init__(self, threshold: float = 0.5): self.wrapper = PromptfooWrapper(promptfoo_path="/opt/homebrew/bin/promptfoo") self.threshold = threshold - async def measure(self, instances): - with open(os.path.join(os.getcwd(), "evals/promptfooconfig.yaml"), "r") as file: + async def measure(self, instances, context_provider): + with open(os.path.join(os.getcwd(), "evals/promptfoo_config_template.yaml"), "r") as file: config = yaml.safe_load(file) - # creating config file + # Fill config file with test cases tests = [] for instance in instances: - from evals.eval_on_hotpot import get_context_with_cognee - - context = await get_context_with_cognee(instance) + context = await context_provider(instance) test = { "vars": { "name": instance["question"][:15], "question": instance["question"], - "context": str(context), + "context": context, } } tests.append(test) From 079c16c8f0c3db7da3279d7ddb6c6d22ab504c68 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Mon, 13 Jan 2025 15:36:23 +0100 Subject: [PATCH 07/15] LLM as a judge metrics implemented in a uniform way --- evals/deepeval_metrics.py | 26 +++++++++++++++----------- evals/promptfoo_metrics.py | 9 +++++---- evals/promptfoo_wrapper.py | 3 +++ evals/promptfooprompt.json | 2 +- evals/qa_metrics_utils.py | 11 +++++++++-- 5 files changed, 33 insertions(+), 18 deletions(-) diff --git a/evals/deepeval_metrics.py b/evals/deepeval_metrics.py index 6ef8e822f..51d6c9181 100644 --- a/evals/deepeval_metrics.py +++ b/evals/deepeval_metrics.py @@ -2,14 +2,13 @@ from deepeval.test_case import LLMTestCase, LLMTestCaseParams from evals.official_hotpot_metrics import exact_match_score, f1_score +from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts correctness_metric = GEval( name="Correctness", model="gpt-4o-mini", evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT], - evaluation_steps=[ - "Determine whether the actual output is factually correct based on the expected output." - ], + evaluation_steps=[llm_judge_prompts["correctness"]], ) comprehensiveness_metric = GEval( @@ -20,9 +19,7 @@ LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT, ], - evaluation_steps=[ - "Determine how much detail the answer provides to cover all the aspects and details of the question." - ], + evaluation_steps=[llm_judge_prompts["comprehensiveness"]], ) diversity_metric = GEval( @@ -33,9 +30,7 @@ LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT, ], - evaluation_steps=[ - "Determine how varied and rich the answer is in providing different perspectives and insights on the question." - ], + evaluation_steps=[llm_judge_prompts["diversity"]], ) empowerment_metric = GEval( @@ -46,9 +41,18 @@ LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT, ], - evaluation_steps=[ - "Determine how well the answer helps the reader understand and make informed judgements about the topic." 
+ evaluation_steps=[llm_judge_prompts["empowerment"]], +) + +directness_metric = GEval( + name="Directness", + model="gpt-4o-mini", + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, ], + evaluation_steps=[llm_judge_prompts["directness"]], ) diff --git a/evals/promptfoo_metrics.py b/evals/promptfoo_metrics.py index 05a18b234..8c1169d00 100644 --- a/evals/promptfoo_metrics.py +++ b/evals/promptfoo_metrics.py @@ -5,15 +5,17 @@ import json -class PromptfooComprehensiveness: - def __init__(self, threshold: float = 0.5): +class PromptfooMetric: + def __init__(self, judge_prompt): self.wrapper = PromptfooWrapper(promptfoo_path="/opt/homebrew/bin/promptfoo") - self.threshold = threshold + self.judge_prompt = judge_prompt async def measure(self, instances, context_provider): with open(os.path.join(os.getcwd(), "evals/promptfoo_config_template.yaml"), "r") as file: config = yaml.safe_load(file) + config["defaultTest"] = [{"assert": {"type": "llm_rubric", "value": self.judge_prompt}}] + # Fill config file with test cases tests = [] for instance in instances: @@ -47,5 +49,4 @@ async def measure(self, instances, context_provider): self.score = results["results"]["prompts"][0]["metrics"]["score"] - self.success = self.score >= self.threshold return self.score diff --git a/evals/promptfoo_wrapper.py b/evals/promptfoo_wrapper.py index 0ed2d4850..32be242e5 100644 --- a/evals/promptfoo_wrapper.py +++ b/evals/promptfoo_wrapper.py @@ -133,6 +133,9 @@ def run_eval( filename = "benchmark_results" filename = os.path.join(os.getcwd(), f"{filename}.json") + # Create an empty JSON file + with open(filename, "w") as file: + json.dump({}, file) cmd = [self.promptfoo_path, "eval"] if prompt_file: diff --git a/evals/promptfooprompt.json b/evals/promptfooprompt.json index 06d5ebc98..fb6351406 100644 --- a/evals/promptfooprompt.json +++ b/evals/promptfooprompt.json @@ -1,7 +1,7 @@ [ { "role": "system", - "content": "Answer the question using the provided context. Be as brief as possible. Each entry in the context is tuple of length 3, representing an edge of a knowledge graph with its two nodes.." + "content": "Answer the question using the provided context. Be as brief as possible." 
}, { "role": "user", diff --git a/evals/qa_metrics_utils.py b/evals/qa_metrics_utils.py index 8d12cfe4a..107fe429d 100644 --- a/evals/qa_metrics_utils.py +++ b/evals/qa_metrics_utils.py @@ -3,12 +3,14 @@ comprehensiveness_metric, diversity_metric, empowerment_metric, + directness_metric, f1_score_metric, em_score_metric, ) -from evals.promptfoo_metrics import PromptfooComprehensiveness +from evals.promptfoo_metrics import PromptfooMetric from deepeval.metrics import AnswerRelevancyMetric import deepeval.metrics +from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts native_deepeval_metrics = {"AnswerRelevancy": AnswerRelevancyMetric} @@ -17,12 +19,17 @@ "Comprehensiveness": comprehensiveness_metric, "Diversity": diversity_metric, "Empowerment": empowerment_metric, + "Directness": directness_metric, "F1": f1_score_metric, "EM": em_score_metric, } promptfoo_metrics = { - "promptfoo.comprehensiveness": PromptfooComprehensiveness, + "promptfoo.correctness": PromptfooMetric(llm_judge_prompts["correctness"]), + "promptfoo.comprehensiveness": PromptfooMetric(llm_judge_prompts["comprehensiveness"]), + "promptfoo.diversity": PromptfooMetric(llm_judge_prompts["diversity"]), + "promptfoo.empowerment": PromptfooMetric(llm_judge_prompts["empowerment"]), + "promptfoo.directness": PromptfooMetric(llm_judge_prompts["directness"]), } qa_metrics = native_deepeval_metrics | custom_deepeval_metrics | promptfoo_metrics From 273b16c0a8f65dcb202e8cb2d5fe5266bf52ebd0 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 09:38:41 +0100 Subject: [PATCH 08/15] Use requests.get instead of wget --- evals/qa_dataset_utils.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/evals/qa_dataset_utils.py b/evals/qa_dataset_utils.py index bb6dd5bbb..c570455c4 100644 --- a/evals/qa_dataset_utils.py +++ b/evals/qa_dataset_utils.py @@ -1,6 +1,6 @@ from cognee.root_dir import get_absolute_path import json -import wget +import requests from jsonschema import ValidationError, validate from pathlib import Path @@ -31,7 +31,7 @@ } -def download_qa_dataset(dataset_name: str, dir: str): +def download_qa_dataset(dataset_name: str, filepath: Path): if dataset_name not in qa_datasets: raise ValueError(f"{dataset_name} is not a supported dataset.") @@ -44,7 +44,15 @@ def download_qa_dataset(dataset_name: str, dir: str): and unzip it." ) - wget.download(url, out=dir) + response = requests.get(url, stream=True) + + if response.status_code == 200: + with open(filepath, "wb") as file: + for chunk in response.iter_content(chunk_size=8192): + file.write(chunk) + print(f"Dataset {dataset_name} downloaded and saved to {filepath}") + else: + print(f"Failed to download {dataset_name}. 
Status code: {response.status_code}") def load_qa_dataset(dataset_name_or_filename: str): @@ -58,7 +66,7 @@ def load_qa_dataset(dataset_name_or_filename: str): filepath = data_root_dir / Path(filename) if not filepath.exists(): - download_qa_dataset(dataset_name, data_root_dir) + download_qa_dataset(dataset_name, filepath) else: filename = dataset_name_or_filename filepath = Path(filename) From 66d8850592c2a927718e5193dcc3728cf37c224f Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 09:59:28 +0100 Subject: [PATCH 09/15] clean up promptfoo config template --- evals/promptfoo_config_template.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/evals/promptfoo_config_template.yaml b/evals/promptfoo_config_template.yaml index b7dd32764..f2201fca2 100644 --- a/evals/promptfoo_config_template.yaml +++ b/evals/promptfoo_config_template.yaml @@ -5,9 +5,3 @@ description: "My eval" providers: - id: openai:gpt-4o-mini - -defaultTest: - assert: - # Comprehensiveness - - type: llm-rubric - value: Determine how much detail the answer provides to cover all the aspects and details of the question. From e4145168f0c95a4384c88d663047b99d027cea62 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 10:12:50 +0100 Subject: [PATCH 10/15] minor fixes --- evals/eval_on_hotpot.py | 4 ++-- evals/promptfoo_wrapper.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index 27d9b554f..cfc29dcc1 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -37,7 +37,7 @@ async def get_context_without_cognee(instance): async def answer_qa_instance(instance, context_provider): - context = context_provider(instance) + context = await context_provider(instance) args = { "question": instance["question"], @@ -98,7 +98,7 @@ async def eval_on_QA_dataset( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--dataset", type=str, help="Which dataset to evaluate on") + parser.add_argument("--dataset", type=str, required=True, help="Which dataset to evaluate on") parser.add_argument("--with_cognee", action="store_true") parser.add_argument("--num_samples", type=int, default=500) parser.add_argument("--metric_name", type=str, default="Correctness") diff --git a/evals/promptfoo_wrapper.py b/evals/promptfoo_wrapper.py index 32be242e5..97a03bbf8 100644 --- a/evals/promptfoo_wrapper.py +++ b/evals/promptfoo_wrapper.py @@ -114,7 +114,7 @@ def run_eval( out_format: str = "json", extra_args: Optional[List[str]] = None, binary_output: bool = False, - ) -> List[Dict]: + ) -> Dict: """ Run the `promptfoo eval` command with the provided parameters and return parsed results. 
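To make the dataset plumbing above concrete, here is a minimal usage sketch (not part of the patch series). It assumes the repository root is on PYTHONPATH and, for the named dataset, network access to the HotpotQA URL; `my_local_qa.json` is a hypothetical file that follows `qa_json_schema`.

```python
# Minimal sketch of the qa_dataset_utils flow (assumptions: repo root on PYTHONPATH,
# network access for the HotpotQA download; "my_local_qa.json" is a hypothetical file).
from evals.qa_dataset_utils import load_qa_dataset

# A known dataset name is resolved via qa_datasets, downloaded with requests if the
# file is missing, and validated against qa_json_schema before being returned.
hotpot_instances = load_qa_dataset("hotpotqa")
print(hotpot_instances[0]["question"], "->", hotpot_instances[0]["answer"])

# Any other argument is treated as a path to a local JSON file with the same
# [{"question": ..., "answer": ..., "context": [...]}, ...] structure.
local_instances = load_qa_dataset("my_local_qa.json")
```

Note that `2wikimultihop` still has to be fetched and unzipped manually; `download_qa_dataset` only automates the HotpotQA download.
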
From c95dbb8d4aaee2b08a04ddd3036b940b7b594c0b Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 10:33:03 +0100 Subject: [PATCH 11/15] get promptfoo path instead of hardcoding --- evals/promptfoo_metrics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/evals/promptfoo_metrics.py b/evals/promptfoo_metrics.py index 8c1169d00..addd0030a 100644 --- a/evals/promptfoo_metrics.py +++ b/evals/promptfoo_metrics.py @@ -1,13 +1,14 @@ from evals.promptfoo_wrapper import PromptfooWrapper import os -from deepeval.test_case import LLMTestCase import yaml import json +import shutil class PromptfooMetric: def __init__(self, judge_prompt): - self.wrapper = PromptfooWrapper(promptfoo_path="/opt/homebrew/bin/promptfoo") + promptfoo_path = shutil.which("promptfoo") + self.wrapper = PromptfooWrapper(promptfoo_path=promptfoo_path) self.judge_prompt = judge_prompt async def measure(self, instances, context_provider): From 14cac1b9751af02b90d3d43ec6d0bf781b8089bd Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 10:45:50 +0100 Subject: [PATCH 12/15] minor fixes --- evals/qa_dataset_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/qa_dataset_utils.py b/evals/qa_dataset_utils.py index c570455c4..ac97a180c 100644 --- a/evals/qa_dataset_utils.py +++ b/evals/qa_dataset_utils.py @@ -55,7 +55,7 @@ def download_qa_dataset(dataset_name: str, filepath: Path): print(f"Failed to download {dataset_name}. Status code: {response.status_code}") -def load_qa_dataset(dataset_name_or_filename: str): +def load_qa_dataset(dataset_name_or_filename: str) -> list[dict]: if dataset_name_or_filename in qa_datasets: dataset_name = dataset_name_or_filename filename = qa_datasets[dataset_name]["filename"] @@ -77,6 +77,6 @@ def load_qa_dataset(dataset_name_or_filename: str): try: validate(instance=dataset, schema=qa_json_schema) except ValidationError as e: - print("File is not a valid QA dataset:", e.message) + raise ValidationError(f"Invalid QA dataset: {e.message}") return dataset From 51d460713d1fe64942b6865efdbe08420b3cccb6 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 10:50:19 +0100 Subject: [PATCH 13/15] Add LLM as a judge prompts --- cognee/infrastructure/llm/prompts/llm_judge_prompts.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 cognee/infrastructure/llm/prompts/llm_judge_prompts.py diff --git a/cognee/infrastructure/llm/prompts/llm_judge_prompts.py b/cognee/infrastructure/llm/prompts/llm_judge_prompts.py new file mode 100644 index 000000000..9b94ebdad --- /dev/null +++ b/cognee/infrastructure/llm/prompts/llm_judge_prompts.py @@ -0,0 +1,9 @@ +# LLM-as-a-judge metrics as described here: https://arxiv.org/abs/2404.16130 + +llm_judge_prompts = { + "correctness": "Determine whether the actual output is factually correct based on the expected output.", + "comprehensiveness": "Determine how much detail the answer provides to cover all the aspects and details of the question.", + "diversity": "Determine how varied and rich the answer is in providing different perspectives and insights on the question.", + "empowerment": "Determine how well the answer helps the reader understand and make informed judgements about the topic.", + "directness": "Determine how specifically and clearly the answer addresses the question.", +} From d20ecd09f78734029ecaa557476634111111bf61 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 14:39:44 +0100 Subject: [PATCH 14/15] Support 4 different rag options 
in eval --- evals/eval_on_hotpot.py | 46 ++++++---------------- evals/qa_context_provider_utils.py | 61 ++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 34 deletions(-) create mode 100644 evals/qa_context_provider_utils.py diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index cfc29dcc1..ff7580b04 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -5,35 +5,11 @@ from deepeval.test_case import LLMTestCase from tqdm import tqdm -import cognee -from cognee.api.v1.search import SearchType from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt from evals.qa_dataset_utils import load_qa_dataset from evals.qa_metrics_utils import get_metric - - -async def get_context_with_cognee(instance): - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - - for title, sentences in instance["context"]: - await cognee.add("\n".join(sentences), dataset_name="QA") - await cognee.cognify("QA") - - search_results = await cognee.search(SearchType.INSIGHTS, query_text=instance["question"]) - search_results_second = await cognee.search( - SearchType.SUMMARIES, query_text=instance["question"] - ) - search_results = search_results + search_results_second - - search_results_str = "\n".join([context_item["text"] for context_item in search_results]) - - return search_results_str - - -async def get_context_without_cognee(instance): - return instance["context"] +from evals.qa_context_provider_utils import qa_context_providers async def answer_qa_instance(instance, context_provider): @@ -72,11 +48,12 @@ async def eval_answers(instances, answers, eval_metric): async def eval_on_QA_dataset( - dataset_name_or_filename: str, context_provider, num_samples, eval_metric_name + dataset_name_or_filename: str, context_provider_name, num_samples, eval_metric_name ): dataset = load_qa_dataset(dataset_name_or_filename) - + context_provider = qa_context_providers[context_provider_name] eval_metric = get_metric(eval_metric_name) + instances = dataset if not num_samples else dataset[:num_samples] if eval_metric_name.startswith("promptfoo"): @@ -99,18 +76,19 @@ async def eval_on_QA_dataset( parser = argparse.ArgumentParser() parser.add_argument("--dataset", type=str, required=True, help="Which dataset to evaluate on") - parser.add_argument("--with_cognee", action="store_true") + parser.add_argument( + "--rag_option", + type=str, + choices=qa_context_providers.keys(), + required=True, + help="RAG option to use for providing context", + ) parser.add_argument("--num_samples", type=int, default=500) parser.add_argument("--metric_name", type=str, default="Correctness") args = parser.parse_args() - if args.with_cognee: - context_provider = get_context_with_cognee - else: - context_provider = get_context_without_cognee - avg_score = asyncio.run( - eval_on_QA_dataset(args.dataset, context_provider, args.num_samples, args.metric_name) + eval_on_QA_dataset(args.dataset, args.rag_option, args.num_samples, args.metric_name) ) print(f"Average {args.metric_name}: {avg_score}") diff --git a/evals/qa_context_provider_utils.py b/evals/qa_context_provider_utils.py new file mode 100644 index 000000000..52d63a4e3 --- /dev/null +++ b/evals/qa_context_provider_utils.py @@ -0,0 +1,61 @@ +import cognee +from cognee.api.v1.search import SearchType +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.modules.retrieval.brute_force_triplet_search import 
brute_force_triplet_search +from cognee.tasks.completion.graph_query_completion import retrieved_edges_to_string + + +async def get_context_without_rag(instance: dict) -> str: + return instance["context"] + + +async def cognify_instance(instance: dict): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + for title, sentences in instance["context"]: + await cognee.add("\n".join(sentences), dataset_name="QA") + await cognee.cognify("QA") + + +async def get_context_with_cognee(instance: dict) -> str: + await cognify_instance(instance) + + search_results = await cognee.search(SearchType.INSIGHTS, query_text=instance["question"]) + search_results_second = await cognee.search( + SearchType.SUMMARIES, query_text=instance["question"] + ) + search_results = search_results + search_results_second + + search_results_str = "\n".join([context_item["text"] for context_item in search_results]) + + return search_results_str + + +async def get_context_with_simple_rag(instance: dict) -> str: + await cognify_instance(instance) + + vector_engine = get_vector_engine() + found_chunks = await vector_engine.search("document_chunk_text", instance["question"], limit=5) + + search_results_str = "\n".join([context_item.payload["text"] for context_item in found_chunks]) + + return search_results_str + + +async def get_context_with_brute_force_triplet_search(instance: dict) -> str: + await cognify_instance(instance) + + found_triplets = await brute_force_triplet_search(instance["question"], top_k=5) + + search_results_str = retrieved_edges_to_string(found_triplets) + + return search_results_str + + +qa_context_providers = { + "no_rag": get_context_without_rag, + "cognee": get_context_with_cognee, + "simple_rag": get_context_with_simple_rag, + "brute_force": get_context_with_brute_force_triplet_search, +} From 9c10303e9e2244b00f3b324aed31c6c247835ba2 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 14 Jan 2025 18:31:37 +0100 Subject: [PATCH 15/15] Minor refactor and logger usage --- evals/eval_on_hotpot.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py index cfc29dcc1..54dcaffd0 100644 --- a/evals/eval_on_hotpot.py +++ b/evals/eval_on_hotpot.py @@ -4,7 +4,7 @@ from deepeval.dataset import EvaluationDataset from deepeval.test_case import LLMTestCase from tqdm import tqdm - +import logging import cognee from cognee.api.v1.search import SearchType from cognee.infrastructure.llm.get_llm_client import get_llm_client @@ -12,6 +12,8 @@ from evals.qa_dataset_utils import load_qa_dataset from evals.qa_metrics_utils import get_metric +logger = logging.getLogger(__name__) + async def get_context_with_cognee(instance): await cognee.prune.prune_data() @@ -56,7 +58,7 @@ async def answer_qa_instance(instance, context_provider): return answer_prediction -async def eval_answers(instances, answers, eval_metric): +async def deepeval_answers(instances, answers, eval_metric): test_cases = [] for instance, answer in zip(instances, answers): @@ -71,23 +73,13 @@ async def eval_answers(instances, answers, eval_metric): return eval_results -async def eval_on_QA_dataset( - dataset_name_or_filename: str, context_provider, num_samples, eval_metric_name -): - dataset = load_qa_dataset(dataset_name_or_filename) - - eval_metric = get_metric(eval_metric_name) - instances = dataset if not num_samples else dataset[:num_samples] - - if eval_metric_name.startswith("promptfoo"): - return await 
eval_metric.measure(instances, context_provider) - +async def deepeval_on_instances(instances, context_provider, eval_metric): answers = [] for instance in tqdm(instances, desc="Getting answers"): answer = await answer_qa_instance(instance, context_provider) answers.append(answer) - eval_results = await eval_answers(instances, answers, eval_metric) + eval_results = await deepeval_answers(instances, answers, eval_metric) avg_score = statistics.mean( [result.metrics_data[0].score for result in eval_results.test_results] ) @@ -95,6 +87,20 @@ async def eval_on_QA_dataset( return avg_score +async def eval_on_QA_dataset( + dataset_name_or_filename: str, context_provider, num_samples, eval_metric_name +): + dataset = load_qa_dataset(dataset_name_or_filename) + + eval_metric = get_metric(eval_metric_name) + instances = dataset if not num_samples else dataset[:num_samples] + + if eval_metric_name.startswith("promptfoo"): + return await eval_metric.measure(instances, context_provider) + else: + return await deepeval_on_instances(instances, context_provider, eval_metric) + + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -113,4 +119,4 @@ async def eval_on_QA_dataset( avg_score = asyncio.run( eval_on_QA_dataset(args.dataset, context_provider, args.num_samples, args.metric_name) ) - print(f"Average {args.metric_name}: {avg_score}") + logger.info(f"Average {args.metric_name}: {avg_score}")
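
Taken together, the series turns evals/eval_on_hotpot.py into a thin driver over three pluggable pieces: a dataset loader, a context provider, and a judge metric. The sketch below is not part of the patches; it assumes the evals package is importable, that cognee is configured with an LLM backend for the retrieval-based providers, and that the instance is a made-up example in the HotpotQA format. The CLI added in patch 14 would be invoked along the lines of `python evals/eval_on_hotpot.py --dataset hotpotqa --rag_option brute_force --num_samples 5 --metric_name Comprehensiveness`.

```python
# Illustrative composition of the pieces added in this series (assumptions: evals/ is
# importable, cognee has an LLM backend configured for the non-"no_rag" providers, and
# the instance below is an illustrative example following the QA schema).
import asyncio

from evals.qa_context_provider_utils import qa_context_providers
from evals.qa_metrics_utils import get_metric

instance = {
    "question": "Which magazine was started first, Arthur's Magazine or First for Women?",
    "answer": "Arthur's Magazine",
    "context": [
        ["Arthur's Magazine", ["Arthur's Magazine (1844-1846) was an American literary periodical."]],
        ["First for Women", ["First for Women is a woman's magazine launched in 1989."]],
    ],
}

context_provider = qa_context_providers["no_rag"]  # or "cognee", "simple_rag", "brute_force"
metric = get_metric("Comprehensiveness")           # GEval judge defined in deepeval_metrics.py

context = asyncio.run(context_provider(instance))
print(context)
```

From here, `answer_qa_instance` renders the prompt with this context and `deepeval_on_instances` wraps the answers in `LLMTestCase` objects scored by the chosen metric.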