
Commit

Merge pull request #4 from fastenhealth/evaluate_generation
Evaluate generation
dgbaenar authored Sep 3, 2024
2 parents e4b518b + e7d78ec commit 6ed90b3
Showing 19 changed files with 2,001 additions and 60 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -123,7 +123,6 @@ scripts/rag_evaluation/data/flat_files/
scripts/rag_evaluation/evaluate_generation/generation_speed/data/*.csv
scripts/rag_evaluation/evaluate_retrieval/data/output/*csv

*.jsonl
/models
/data/
!/evaluation/data/
106 changes: 85 additions & 21 deletions evaluation/core/evaluators/generation/correctness.py
@@ -7,9 +7,14 @@
https://docs.llamaindex.ai/en/stable/examples/low_level/evaluation/#evaluating-generation
"""

import csv
import json
import logging
import os
from tqdm import tqdm
import pandas as pd

from evaluation.core.openai import get_chat_completion
from evaluation.core.openai.openai import get_chat_completion


CORRECTNESS_SYS_TMPL = """
@@ -63,10 +68,11 @@


class CorrectnessEvaluator:
def __init__(self, openai_api_key, model="gpt-4o-2024-08-06", threshold=4.0):
def __init__(self, openai_api_key, model="gpt-4o-2024-08-06", threshold=4.0, max_tokens=300):
self.openai_api_key = openai_api_key
self.model = model
self.threshold = threshold
self.max_tokens = max_tokens

def run_correctness_eval(self, query_str: str, reference_answer: str, generated_answer: str):
"""
@@ -80,34 +86,92 @@ def run_correctness_eval(self, query_str: str, reference_answer: str, generated_
Returns:
- dict, containing whether the answer passes the threshold, the score, and reasoning.
"""
user_prompt = CORRECTNESS_USER_TMPL.format(query_str, reference_answer, generated_answer)
system_prompt = CORRECTNESS_SYS_TMPL

open_ai_response = get_chat_completion(self.openai_api_key, user_prompt, system_prompt, model=self.model)
score = open_ai_response["score"]
reasoning = open_ai_response["reasoning"]

return {"passing": score >= self.threshold, "score": score, "reason": reasoning}

def run_batch_evaluation(self, df: pd.DataFrame):
try:
user_prompt = CORRECTNESS_USER_TMPL.format(
query=query_str,
reference_answer=reference_answer,
generated_answer=generated_answer)

system_prompt = CORRECTNESS_SYS_TMPL

open_ai_response = get_chat_completion(self.openai_api_key,
user_prompt,
system_prompt,
ANSWER_JSON_SCHEMA,
model=self.model,
max_tokens=self.max_tokens)
json_answer = json.loads(open_ai_response.get("choices")[
0].get("message").get("content"))

score = json_answer["score"]
reasoning = json_answer["reasoning"]

return {"score": score, "reasoning": reasoning, "passing": score >= self.threshold, }

except json.JSONDecodeError as e:
logging.error(f"Failed to decode JSON response: {e}")
return {"score": None, "passing": None, "reasoning": "Invalid JSON response"}

except KeyError as e:
logging.error(f"Missing key in JSON response: {e}")
return {"score": None, "passing": None, "reasoning": "Incomplete JSON response"}

except Exception as e:
logging.error(f"An error occurred: {e}")
return {"score": None, "passing": None, "reasoning": "An unexpected error occurred"}

def run_batch_evaluation(self,
df: pd.DataFrame,
output_file: str,
query_column: str,
reference_answer_column: str,
generated_answer_column: str,
resource_id_column: str
):
"""
Runs correctness evaluation on a batch of queries, reference answers, and generated answers.
Saves results incrementally to avoid data loss in case of failure.
Parameters:
- df: pd.DataFrame, a dataframe containing the query, reference answer, generated answer, and resource id columns.
- output_file: str, the path to the output CSV file where results will be saved incrementally.
- query_column, reference_answer_column, generated_answer_column, resource_id_column: str, names of the corresponding columns in df.
Returns:
- float, the mean correctness score normalized to 0-1 (sum of the 1-5 scores divided by 5 * number of rows), rounded to two decimals.
"""
results = []
for _, row in df.iterrows():
result = self.run_correctness_eval(row["query"], row["reference_answer"], row["generated_answer"])
results.append(result)

# Convert list of dicts to a DataFrame
results_df = pd.DataFrame(results)
# Determine if the file already exists
file_exists = os.path.isfile(output_file)

with open(output_file, mode='a', newline='') as file:
writer = csv.DictWriter(
file, fieldnames=[resource_id_column, 'score', 'reasoning', 'passing'])

# Write header only if the file does not exist
if not file_exists:
writer.writeheader()

try:
for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing correctness"):
result = self.run_correctness_eval(
row[query_column],
row[reference_answer_column],
row[generated_answer_column])
result[resource_id_column] = row[resource_id_column]
# Write the result to the CSV file
writer.writerow(result)

# Ensure the data is written to disk
file.flush()

except Exception as e:
print(f"Error encountered: {e}. Saving progress and exiting.")
raise

# Load the results back into a DataFrame and concatenate with the original
results_df = pd.read_csv(output_file)

# Concatenate the original dataframe with the results
df = pd.concat([df, results_df], axis=1)
correctness_mean_score = round(results_df["score"].sum() / (len(results_df) * 5), 2)

return df
return correctness_mean_score
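
A minimal usage sketch of the revised CorrectnessEvaluator interface introduced above. The CSV file names, the API key placeholder, and the column names are illustrative assumptions, not values taken from the repository.

# Hedged usage sketch (not part of the commit): exercises the new
# run_batch_evaluation signature with incremental CSV output.
import pandas as pd

from evaluation.core.evaluators.generation.correctness import CorrectnessEvaluator

# Hypothetical input: one row per question, identified by a resource id.
df = pd.read_csv("generated_answers.csv")  # columns: resource_id, query, reference_answer, generated_answer

evaluator = CorrectnessEvaluator(
    openai_api_key="YOUR_OPENAI_API_KEY",  # placeholder
    threshold=4.0,
    max_tokens=300,
)

# Results are appended to the CSV row by row, so a crash mid-run keeps partial output.
mean_correctness = evaluator.run_batch_evaluation(
    df,
    output_file="correctness_results.csv",
    query_column="query",
    reference_answer_column="reference_answer",
    generated_answer_column="generated_answer",
    resource_id_column="resource_id",
)
print(mean_correctness)  # mean of the 1-5 scores divided by 5, rounded to 2 decimals
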
128 changes: 97 additions & 31 deletions evaluation/core/evaluators/generation/faithfullness.py
@@ -4,8 +4,15 @@
https://docs.llamaindex.ai/en/stable/examples/low_level/evaluation/#evaluating-generation
"""

import csv
import json
import logging
import os
import pandas as pd
from evaluation.core.openai import get_chat_completion

from tqdm import tqdm

from evaluation.core.openai.openai import get_chat_completion


FAITHFULLNESS_SYS_TMPL = """
@@ -66,9 +73,10 @@


class FaithfulnessEvaluator:
def __init__(self, openai_api_key, model="gpt-4o-2024-08-06"):
def __init__(self, openai_api_key, model="gpt-4o-2024-08-06", max_tokens=300):
self.openai_api_key = openai_api_key
self.model = model
self.max_tokens = max_tokens

def run_faithfulness_eval(self, generated_answer: str, contexts: str):
"""
@@ -81,41 +89,99 @@ def run_faithfulness_eval(self, generated_answer: str, contexts: str):
Returns:
- dict, containing evaluations on relevancy, accuracy, conciseness and pertinence, and reasoning.
"""
user_prompt = FAITHFULLNESS_USER_TMPL.format(generated_answer=generated_answer, contexts=contexts)
system_prompt = FAITHFULLNESS_SYS_TMPL

open_ai_response = get_chat_completion(self.openai_api_key, user_prompt, system_prompt, model=self.model)
relevancy = 1 if open_ai_response["relevancy"] == "YES" else 0
accuracy = 1 if open_ai_response["accuracy"] == "YES" else 0
conciseness_and_pertinence = 1 if open_ai_response["conciseness_and_pertinence"] == "YES" else 0
reasoning = open_ai_response["reasoning"]

return {
"relevancy": relevancy,
"accuracy": accuracy,
"conciseness_and_pertinence": conciseness_and_pertinence,
"reasoning": reasoning,
}

def run_batch_evaluation(self, df: pd.DataFrame):
try:
user_prompt = FAITHFULLNESS_USER_TMPL.format(generated_answer=generated_answer,
contexts=contexts)
system_prompt = FAITHFULLNESS_SYS_TMPL

open_ai_response = get_chat_completion(self.openai_api_key,
user_prompt,
system_prompt,
ANSWER_JSON_SCHEMA,
model=self.model,
max_tokens=self.max_tokens)

json_answer = json.loads(open_ai_response.get("choices")[
0].get("message").get("content"))

relevancy = 1 if json_answer["relevancy"] == "YES" else 0
accuracy = 1 if json_answer["accuracy"] == "YES" else 0
conciseness_and_pertinence = 1 if json_answer[
"conciseness_and_pertinence"] == "YES" else 0
reasoning = json_answer["reasoning"]

return {
"relevancy": relevancy,
"accuracy": accuracy,
"conciseness_and_pertinence": conciseness_and_pertinence,
"reasoning": reasoning,
}

except json.JSONDecodeError as e:
logging.error(f"Failed to decode JSON response: {e}")
return {"relevancy": None, "accuracy": None, "conciseness_and_pertinence": None, "reasoning": "Invalid JSON response"}

except KeyError as e:
logging.error(f"Missing key in JSON response: {e}")
return {"relevancy": None, "accuracy": None, "conciseness_and_pertinence": None, "reasoning": "Incomplete JSON response"}

except Exception as e:
logging.error(f"An error occurred: {e}")
return {"relevancy": None, "accuracy": None, "conciseness_and_pertinence": None, "reasoning": "An unexpected error occurred"}

def run_batch_evaluation(self,
df: pd.DataFrame,
output_file: str,
generated_answer_column: str,
contexts_column: str,
resource_id_column: str):
"""
Runs faithfulness evaluation on a batch of generated answers and contexts.
Saves results incrementally to avoid data loss in case of failure.
Parameters:
- df: pd.DataFrame, a dataframe containing the generated answer, contexts, and resource id columns.
- output_file: str, the path to the output CSV file where results will be saved incrementally.
- generated_answer_column, contexts_column, resource_id_column: str, names of the corresponding columns in df.
Returns:
- tuple of floats (relevancy, accuracy, conciseness_and_pertinence), each the fraction of rows judged "YES", rounded to two decimals.
"""
results = []
for _, row in df.iterrows():
result = self.run_faithfulness_eval(row["generated_answer"], row["contexts"])
results.append(result)

# Convert list of dicts to a DataFrame
results_df = pd.DataFrame(results)

# Concatenate the original dataframe with the results
df = pd.concat([df, results_df], axis=1)

return df
# Determine if the file already exists
file_exists = os.path.isfile(output_file)

with open(output_file, mode='a', newline='') as file:
writer = csv.DictWriter(file, fieldnames=[
resource_id_column, 'relevancy', 'accuracy', 'conciseness_and_pertinence', 'reasoning'])

# Write header only if the file does not exist
if not file_exists:
writer.writeheader()

try:
for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing faithfulness"):
result = self.run_faithfulness_eval(
row[generated_answer_column],
row[contexts_column])
result[resource_id_column] = row[resource_id_column]
# Write the result to the CSV file
writer.writerow(result)

# Ensure the data is written to disk
file.flush()

except Exception as e:
print(f"Error encountered: {e}. Saving progress and exiting.")
raise

# Load the results back into a DataFrame and concatenate with the original
results_df = pd.read_csv(output_file)

total_questions = len(results_df)
faithfulness_relevancy = round(results_df["relevancy"].sum(
) / total_questions, 2)
faithfulness_accuracy = round(
results_df["accuracy"].sum() / total_questions, 2)
faithfulness_conciseness_and_pertinence = round(results_df["conciseness_and_pertinence"].sum(
) / total_questions, 2)

return faithfulness_relevancy, faithfulness_accuracy, faithfulness_conciseness_and_pertinence
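
A matching usage sketch for the new FaithfulnessEvaluator batch interface. As above, paths, the API key placeholder, and column names are assumptions for illustration only.

# Hedged usage sketch (not part of the commit): runs the faithfulness batch
# evaluation and unpacks the three aggregate metrics it returns.
import pandas as pd

from evaluation.core.evaluators.generation.faithfullness import FaithfulnessEvaluator

df = pd.read_csv("generated_answers.csv")  # assumed columns: resource_id, generated_answer, contexts

evaluator = FaithfulnessEvaluator(openai_api_key="YOUR_OPENAI_API_KEY", max_tokens=300)  # placeholder key

relevancy, accuracy, conciseness_and_pertinence = evaluator.run_batch_evaluation(
    df,
    output_file="faithfulness_results.csv",
    generated_answer_column="generated_answer",
    contexts_column="contexts",
    resource_id_column="resource_id",
)
# Each value is the fraction of rows judged "YES" for that criterion, rounded to 2 decimals.
print(relevancy, accuracy, conciseness_and_pertinence)
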
