
Support 4 different rag options in eval #439

Merged · 29 commits · merged Jan 15, 2025

Commits (changes shown from all commits)
a67512d
QA eval dataset as argument, with hotpot and 2wikimultihop as options…
alekszievr Jan 8, 2025
ddf1bb8
Merge branch 'dev' into feat/COG-946-abstract-eval-dataset
alekszievr Jan 9, 2025
e0a8c19
Load dataset file by filename, outsource utilities
alekszievr Jan 9, 2025
0186128
Merge branch 'dev' into feat/COG-946-abstract-eval-dataset
alekszievr Jan 9, 2025
92f2e94
Merge branch 'dev' into feat/COG-946-abstract-eval-dataset
alekszievr Jan 10, 2025
1018292
Merge branch 'dev' into feat/COG-946-abstract-eval-dataset
alekszievr Jan 11, 2025
49fb053
restructure metric selection
alekszievr Jan 11, 2025
13422ba
Add comprehensiveness, diversity and empowerment metrics
alekszievr Jan 11, 2025
0ead7d1
Merge branch 'dev' into feat/COG-946-abstract-eval-dataset
alekszievr Jan 13, 2025
d57609d
add promptfoo as an option
alekszievr Jan 13, 2025
192ada3
Merge branch 'dev' into feat/COG-950-improve-metric-selection
alekszievr Jan 13, 2025
8eedc2b
refactor RAG solution in eval
alekszievr Jan 13, 2025
079c16c
LLM as a judge metrics implemented in a uniform way
alekszievr Jan 13, 2025
782c352
Merge branch 'dev' into feat/COG-950-improve-metric-selection
alekszievr Jan 13, 2025
51d56e1
Merge branch 'dev' into feat/COG-946-abstract-eval-dataset
alekszievr Jan 13, 2025
cefe7d8
Merge branch 'dev' into feat/COG-946-abstract-eval-dataset
alekszievr Jan 14, 2025
273b16c
Use requests.get instead of wget
alekszievr Jan 14, 2025
48627b2
Merge branch 'dev' into feat/COG-950-improve-metric-selection
alekszievr Jan 14, 2025
66d8850
clean up promptfoo config template
alekszievr Jan 14, 2025
e414516
minor fixes
alekszievr Jan 14, 2025
f28e208
Merge branch 'feat/COG-946-abstract-eval-dataset' into feat/COG-950-i…
alekszievr Jan 14, 2025
c95dbb8
get promptfoo path instead of hardcoding
alekszievr Jan 14, 2025
14cac1b
minor fixes
alekszievr Jan 14, 2025
51d4607
Add LLM as a judge prompts
alekszievr Jan 14, 2025
d20ecd0
Support 4 different rag options in eval
alekszievr Jan 14, 2025
9131df2
Merge branch 'dev' into feat/COG-950-improve-metric-selection
alekszievr Jan 14, 2025
9c10303
Minor refactor and logger usage
alekszievr Jan 14, 2025
5aa0f05
Minor cleanup and renaming
alekszievr Jan 14, 2025
3921b48
Merge branch 'dev' into feat/cog-954-rag-choice
alekszievr Jan 15, 2025
45 changes: 11 additions & 34 deletions evals/eval_on_hotpot.py
@@ -5,39 +5,15 @@
 from deepeval.test_case import LLMTestCase
 from tqdm import tqdm
 import logging
-import cognee
-from cognee.api.v1.search import SearchType
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
 from evals.qa_dataset_utils import load_qa_dataset
 from evals.qa_metrics_utils import get_metric
+from evals.qa_context_provider_utils import qa_context_providers

 logger = logging.getLogger(__name__)


-async def get_context_with_cognee(instance):
-    await cognee.prune.prune_data()
-    await cognee.prune.prune_system(metadata=True)
-
-    for title, sentences in instance["context"]:
-        await cognee.add("\n".join(sentences), dataset_name="QA")
-    await cognee.cognify("QA")
-
-    search_results = await cognee.search(SearchType.INSIGHTS, query_text=instance["question"])
-    search_results_second = await cognee.search(
-        SearchType.SUMMARIES, query_text=instance["question"]
-    )
-    search_results = search_results + search_results_second
-
-    search_results_str = "\n".join([context_item["text"] for context_item in search_results])
-
-    return search_results_str
-
-
-async def get_context_without_cognee(instance):
-    return instance["context"]
-
-
 async def answer_qa_instance(instance, context_provider):
     context = await context_provider(instance)

@@ -88,10 +64,10 @@ async def deepeval_on_instances(instances, context_provider, eval_metric)


 async def eval_on_QA_dataset(
-    dataset_name_or_filename: str, context_provider, num_samples, eval_metric_name
+    dataset_name_or_filename: str, context_provider_name, num_samples, eval_metric_name
 ):
     dataset = load_qa_dataset(dataset_name_or_filename)
-
+    context_provider = qa_context_providers[context_provider_name]
     eval_metric = get_metric(eval_metric_name)
     instances = dataset if not num_samples else dataset[:num_samples]
@@ -105,18 +81,19 @@ async def eval_on_QA_dataset(
 parser = argparse.ArgumentParser()

 parser.add_argument("--dataset", type=str, required=True, help="Which dataset to evaluate on")
-parser.add_argument("--with_cognee", action="store_true")
+parser.add_argument(
+    "--rag_option",
+    type=str,
+    choices=qa_context_providers.keys(),
+    required=True,
+    help="RAG option to use for providing context",
+)
 parser.add_argument("--num_samples", type=int, default=500)
 parser.add_argument("--metric_name", type=str, default="Correctness")

 args = parser.parse_args()

-if args.with_cognee:
-    context_provider = get_context_with_cognee
-else:
-    context_provider = get_context_without_cognee
-
 avg_score = asyncio.run(
-    eval_on_QA_dataset(args.dataset, context_provider, args.num_samples, args.metric_name)
+    eval_on_QA_dataset(args.dataset, args.rag_option, args.num_samples, args.metric_name)
 )
 logger.info(f"Average {args.metric_name}: {avg_score}")
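
Taken together, these changes replace the old --with_cognee boolean with a named --rag_option looked up in the provider registry. As a minimal sketch of driving the updated entry point programmatically (the dataset identifier "hotpot_qa" and the sample count are illustrative assumptions; accepted values depend on load_qa_dataset and get_metric):

import asyncio

from evals.eval_on_hotpot import eval_on_QA_dataset

# Hypothetical usage sketch: "hotpot_qa" stands in for whatever names
# load_qa_dataset accepts; "cognee" is one of the four registry keys
# ("no_rag", "cognee", "simple_rag", "brute_force").
avg_score = asyncio.run(
    eval_on_QA_dataset(
        "hotpot_qa",    # dataset_name_or_filename
        "cognee",       # context_provider_name
        10,             # num_samples
        "Correctness",  # eval_metric_name
    )
)
print(f"Average Correctness: {avg_score}")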
59 changes: 59 additions & 0 deletions evals/qa_context_provider_utils.py
@@ -0,0 +1,59 @@
import cognee
from cognee.api.v1.search import SearchType
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search
from cognee.tasks.completion.graph_query_completion import retrieved_edges_to_string


async def get_raw_context(instance: dict) -> str:
    return instance["context"]


async def cognify_instance(instance: dict):
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    for title, sentences in instance["context"]:
        await cognee.add("\n".join(sentences), dataset_name="QA")
    await cognee.cognify("QA")


async def get_context_with_cognee(instance: dict) -> str:
    await cognify_instance(instance)

    insights = await cognee.search(SearchType.INSIGHTS, query_text=instance["question"])
    summaries = await cognee.search(SearchType.SUMMARIES, query_text=instance["question"])
    search_results = insights + summaries

    search_results_str = "\n".join([context_item["text"] for context_item in search_results])

    return search_results_str


async def get_context_with_simple_rag(instance: dict) -> str:
    await cognify_instance(instance)

    vector_engine = get_vector_engine()
    found_chunks = await vector_engine.search("document_chunk_text", instance["question"], limit=5)

    search_results_str = "\n".join([context_item.payload["text"] for context_item in found_chunks])

    return search_results_str


async def get_context_with_brute_force_triplet_search(instance: dict) -> str:
    await cognify_instance(instance)

    found_triplets = await brute_force_triplet_search(instance["question"], top_k=5)

    search_results_str = retrieved_edges_to_string(found_triplets)

    return search_results_str


qa_context_providers = {
    "no_rag": get_raw_context,
    "cognee": get_context_with_cognee,
    "simple_rag": get_context_with_simple_rag,
    "brute_force": get_context_with_brute_force_triplet_search,
}
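
Every entry in qa_context_providers shares the same async (instance: dict) -> str contract, so adding a fifth option only requires registering another coroutine. A hypothetical sketch, not part of this PR (the question_only provider and its name are illustrative):

from evals.qa_context_provider_utils import qa_context_providers


async def get_context_with_question_only(instance: dict) -> str:
    # Illustrative no-context baseline: answer from the question alone,
    # ignoring the supplied passages entirely.
    return instance["question"]


# After registration the option would be selectable as
# --rag_option question_only, since argparse reads its choices
# from qa_context_providers.keys().
qa_context_providers["question_only"] = get_context_with_question_only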