Evaluate generation #4

Merged: 4 commits, Sep 3, 2024
1 change: 0 additions & 1 deletion .gitignore
@@ -123,7 +123,6 @@ scripts/rag_evaluation/data/flat_files/
scripts/rag_evaluation/evaluate_generation/generation_speed/data/*.csv
scripts/rag_evaluation/evaluate_retrieval/data/output/*csv

*.jsonl
/models
/data/
!/evaluation/data/
106 changes: 85 additions & 21 deletions evaluation/core/evaluators/generation/correctness.py
@@ -7,9 +7,14 @@
https://docs.llamaindex.ai/en/stable/examples/low_level/evaluation/#evaluating-generation
"""

import csv
import json
import logging
import os
from tqdm import tqdm
import pandas as pd

from evaluation.core.openai import get_chat_completion
from evaluation.core.openai.openai import get_chat_completion


CORRECTNESS_SYS_TMPL = """
@@ -63,10 +68,11 @@


class CorrectnessEvaluator:
def __init__(self, openai_api_key, model="gpt-4o-2024-08-06", threshold=4.0):
def __init__(self, openai_api_key, model="gpt-4o-2024-08-06", threshold=4.0, max_tokens=300):
self.openai_api_key = openai_api_key
self.model = model
self.threshold = threshold
self.max_tokens = max_tokens

def run_correctness_eval(self, query_str: str, reference_answer: str, generated_answer: str):
"""
@@ -80,34 +86,92 @@ def run_correctness_eval(self, query_str: str, reference_answer: str, generated_
Returns:
- dict, containing whether the answer passes the threshold, the score, and reasoning.
"""
user_prompt = CORRECTNESS_USER_TMPL.format(query_str, reference_answer, generated_answer)
system_prompt = CORRECTNESS_SYS_TMPL

open_ai_response = get_chat_completion(self.openai_api_key, user_prompt, system_prompt, model=self.model)
score = open_ai_response["score"]
reasoning = open_ai_response["reasoning"]

return {"passing": score >= self.threshold, "score": score, "reason": reasoning}

def run_batch_evaluation(self, df: pd.DataFrame):
try:
user_prompt = CORRECTNESS_USER_TMPL.format(
query=query_str,
reference_answer=reference_answer,
generated_answer=generated_answer)

system_prompt = CORRECTNESS_SYS_TMPL

open_ai_response = get_chat_completion(self.openai_api_key,
user_prompt,
system_prompt,
ANSWER_JSON_SCHEMA,
model=self.model,
max_tokens=self.max_tokens)
            json_answer = json.loads(
                open_ai_response.get("choices")[0].get("message").get("content"))

score = json_answer["score"]
reasoning = json_answer["reasoning"]

return {"score": score, "reasoning": reasoning, "passing": score >= self.threshold, }

except json.JSONDecodeError as e:
logging.error(f"Failed to decode JSON response: {e}")
return {"score": None, "passing": None, "reasoning": "Invalid JSON response"}

except KeyError as e:
logging.error(f"Missing key in JSON response: {e}")
return {"score": None, "passing": None, "reasoning": "Incomplete JSON response"}

except Exception as e:
logging.error(f"An error occurred: {e}")
return {"score": None, "passing": None, "reasoning": "An unexpected error occurred"}

def run_batch_evaluation(self,
df: pd.DataFrame,
output_file: str,
query_column: str,
reference_answer_column: str,
generated_answer_column: str,
resource_id_column: str
):
"""
Runs correctness evaluation on a batch of queries, reference answers, and generated answers.
Saves results incrementally to avoid data loss in case of failure.

Parameters:
- df: pd.DataFrame, a dataframe with columns 'query', 'reference_answer', and 'generated_answer'.
- output_file: str, the path to the output CSV file where results will be saved.

Returns:
        - float, the mean correctness score across all rows, normalized by the 5-point scale.
"""
results = []
for _, row in df.iterrows():
result = self.run_correctness_eval(row["query"], row["reference_answer"], row["generated_answer"])
results.append(result)

# Convert list of dicts to a DataFrame
results_df = pd.DataFrame(results)
# Determine if the file already exists
file_exists = os.path.isfile(output_file)

with open(output_file, mode='a', newline='') as file:
writer = csv.DictWriter(
file, fieldnames=[resource_id_column, 'score', 'reasoning', 'passing'])

# Write header only if the file does not exist
if not file_exists:
writer.writeheader()

try:
for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing correctness"):
result = self.run_correctness_eval(
row[query_column],
row[reference_answer_column],
row[generated_answer_column])
result[resource_id_column] = row[resource_id_column]
# Write the result to the CSV file
writer.writerow(result)

# Ensure the data is written to disk
file.flush()

except Exception as e:
print(f"Error encountered: {e}. Saving progress and exiting.")
raise

# Load the results back into a DataFrame and concatenate with the original
results_df = pd.read_csv(output_file)

# Concatenate the original dataframe with the results
df = pd.concat([df, results_df], axis=1)
        correctnes_mean_score = round(results_df["score"].sum() / (len(results_df) * 5), 2)

return df
return correctnes_mean_score
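
For context, a minimal usage sketch of the updated CorrectnessEvaluator as it appears in this diff. The dataframe contents, column names, and output path are hypothetical, and it assumes the evaluation package is importable (as in the diff's own imports) and that OPENAI_API_KEY is set in the environment; the actual wiring lives elsewhere in the repository.

import os
import pandas as pd

from evaluation.core.evaluators.generation.correctness import CorrectnessEvaluator

# Hypothetical evaluation set; the real column names are passed via the *_column arguments.
df = pd.DataFrame({
    "resource_id": ["doc-1"],
    "query": ["What does the retriever return?"],
    "reference_answer": ["A ranked list of chunks."],
    "generated_answer": ["It returns a ranked list of text chunks."],
})

evaluator = CorrectnessEvaluator(openai_api_key=os.environ["OPENAI_API_KEY"])
mean_score = evaluator.run_batch_evaluation(
    df,
    output_file="correctness_results.csv",  # appended to incrementally, one row per evaluation
    query_column="query",
    reference_answer_column="reference_answer",
    generated_answer_column="generated_answer",
    resource_id_column="resource_id",
)
print(mean_score)  # mean score normalized by the 5-point scale, rounded to 2 decimals
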
128 changes: 97 additions & 31 deletions evaluation/core/evaluators/generation/faithfullness.py
@@ -4,8 +4,15 @@
https://docs.llamaindex.ai/en/stable/examples/low_level/evaluation/#evaluating-generation
"""

import csv
import json
import logging
import os
import pandas as pd
from evaluation.core.openai import get_chat_completion

from tqdm import tqdm

from evaluation.core.openai.openai import get_chat_completion


FAITHFULLNESS_SYS_TMPL = """
@@ -66,9 +73,10 @@


class FaithfulnessEvaluator:
def __init__(self, openai_api_key, model="gpt-4o-2024-08-06"):
def __init__(self, openai_api_key, model="gpt-4o-2024-08-06", max_tokens=300):
self.openai_api_key = openai_api_key
self.model = model
self.max_tokens = max_tokens

def run_faithfulness_eval(self, generated_answer: str, contexts: str):
"""
@@ -81,41 +89,99 @@ def run_faithfulness_eval(self, generated_answer: str, contexts: str):
Returns:
- dict, containing evaluations on relevancy, accuracy, conciseness and pertinence, and reasoning.
"""
user_prompt = FAITHFULLNESS_USER_TMPL.format(generated_answer=generated_answer, contexts=contexts)
system_prompt = FAITHFULLNESS_SYS_TMPL

open_ai_response = get_chat_completion(self.openai_api_key, user_prompt, system_prompt, model=self.model)
relevancy = 1 if open_ai_response["relevancy"] == "YES" else 0
accuracy = 1 if open_ai_response["accuracy"] == "YES" else 0
conciseness_and_pertinence = 1 if open_ai_response["conciseness_and_pertinence"] == "YES" else 0
reasoning = open_ai_response["reasoning"]

return {
"relevancy": relevancy,
"accuracy": accuracy,
"conciseness_and_pertinence": conciseness_and_pertinence,
"reasoning": reasoning,
}

def run_batch_evaluation(self, df: pd.DataFrame):
try:
user_prompt = FAITHFULLNESS_USER_TMPL.format(generated_answer=generated_answer,
contexts=contexts)
system_prompt = FAITHFULLNESS_SYS_TMPL

open_ai_response = get_chat_completion(self.openai_api_key,
user_prompt,
system_prompt,
ANSWER_JSON_SCHEMA,
model=self.model,
max_tokens=self.max_tokens)

            json_answer = json.loads(
                open_ai_response.get("choices")[0].get("message").get("content"))

relevancy = 1 if json_answer["relevancy"] == "YES" else 0
accuracy = 1 if json_answer["accuracy"] == "YES" else 0
            conciseness_and_pertinence = 1 if json_answer["conciseness_and_pertinence"] == "YES" else 0
reasoning = json_answer["reasoning"]

return {
"relevancy": relevancy,
"accuracy": accuracy,
"conciseness_and_pertinence": conciseness_and_pertinence,
"reasoning": reasoning,
}

except json.JSONDecodeError as e:
logging.error(f"Failed to decode JSON response: {e}")
return {"relevancy": None, "accuracy": None, "conciseness_and_pertinence": None, "reasoning": "Invalid JSON response"}

except KeyError as e:
logging.error(f"Missing key in JSON response: {e}")
return {"relevancy": None, "accuracy": None, "conciseness_and_pertinence": None, "reasoning": "Incomplete JSON response"}

except Exception as e:
logging.error(f"An error occurred: {e}")
return {"relevancy": None, "accuracy": None, "conciseness_and_pertinence": None, "reasoning": "An unexpected error occurred"}

def run_batch_evaluation(self,
df: pd.DataFrame,
output_file: str,
generated_answer_column: str,
contexts_column: str,
resource_id_column: str):
"""
Runs faithfulness evaluation on a batch of generated answers and contexts.
Saves results incrementally to avoid data loss in case of failure.

Parameters:
- df: pd.DataFrame, a dataframe with columns 'generated_answer' and 'contexts'.
- output_file: str, the path to the output CSV file where results will be saved.

Returns:
        - tuple of three floats: the mean relevancy, accuracy, and conciseness-and-pertinence scores across all rows.
"""
results = []
for _, row in df.iterrows():
result = self.run_faithfulness_eval(row["generated_answer"], row["contexts"])
results.append(result)

# Convert list of dicts to a DataFrame
results_df = pd.DataFrame(results)

# Concatenate the original dataframe with the results
df = pd.concat([df, results_df], axis=1)

return df
# Determine if the file already exists
file_exists = os.path.isfile(output_file)

with open(output_file, mode='a', newline='') as file:
writer = csv.DictWriter(file, fieldnames=[
resource_id_column, 'relevancy', 'accuracy', 'conciseness_and_pertinence', 'reasoning'])

# Write header only if the file does not exist
if not file_exists:
writer.writeheader()

try:
for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing faithfulness"):
result = self.run_faithfulness_eval(
row[generated_answer_column],
row[contexts_column])
result[resource_id_column] = row[resource_id_column]
# Write the result to the CSV file
writer.writerow(result)

# Ensure the data is written to disk
file.flush()

except Exception as e:
print(f"Error encountered: {e}. Saving progress and exiting.")
raise

# Load the results back into a DataFrame and concatenate with the original
results_df = pd.read_csv(output_file)

total_questions = len(results_df)
        faithfulness_relevancy = round(results_df["relevancy"].sum() / total_questions, 2)
        faithfulness_accuracy = round(results_df["accuracy"].sum() / total_questions, 2)
        faithfulness_conciseness_and_pertinence = round(
            results_df["conciseness_and_pertinence"].sum() / total_questions, 2)

return faithfulness_relevancy, faithfulness_accuracy, faithfulness_conciseness_and_pertinence
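
And a matching sketch for the updated FaithfulnessEvaluator. Again the dataframe, column names, and output path are illustrative only; the method returns the three rounded ratios computed at the end of the diff above.

import os
import pandas as pd

from evaluation.core.evaluators.generation.faithfullness import FaithfulnessEvaluator

# Hypothetical inputs: each row pairs a generated answer with its retrieved contexts.
df = pd.DataFrame({
    "resource_id": ["doc-1"],
    "generated_answer": ["The retriever returns a ranked list of text chunks."],
    "contexts": ["The retriever scores each chunk and returns the top-k results."],
})

evaluator = FaithfulnessEvaluator(openai_api_key=os.environ["OPENAI_API_KEY"])
relevancy, accuracy, conciseness_and_pertinence = evaluator.run_batch_evaluation(
    df,
    output_file="faithfulness_results.csv",
    generated_answer_column="generated_answer",
    contexts_column="contexts",
    resource_id_column="resource_id",
)
# Each value is the fraction of rows the judge model answered "YES" for, rounded to 2 decimals.
print(relevancy, accuracy, conciseness_and_pertinence)
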