-
Notifications
You must be signed in to change notification settings - Fork 2
/
evaluate_faq.py
46 lines (33 loc) · 1.55 KB
/
evaluate_faq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
# Sentence-embedding model (KBLab Swedish sentence-BERT checkpoint) used to
# embed both the questions and the candidate answers.
model = SentenceTransformer("KBLab/sentence-bert-swedish-cased")

# Tab-separated evaluation set. The columns read by this script are
# category_id, source, question and correct_answer.
df = pd.read_csv("faq_dev.tsv", sep="\t")

df_list = []
# Retrieval is evaluated within each category: the candidate pool for a
# question consists of the correct answers of every row that shares its
# category_id.
for _category, group in df.groupby("category_id"):
    question_embeddings = model.encode(
        group["question"].tolist(), normalize_embeddings=True
    )
    candidate_embeddings = model.encode(
        group["correct_answer"].tolist(), normalize_embeddings=True
    )

    # normalize_embeddings=True requests unit-length vectors, so this dot
    # product acts as a cosine similarity. Entry [i, j] scores question i
    # against candidate answer j of the same category.
    similarities = question_embeddings @ candidate_embeddings.T

    # Choose candidate answer with highest similarity to given question as prediction
    best_candidate = np.argmax(similarities, axis=1)
    predictions = group["correct_answer"].to_numpy()[best_candidate].tolist()

    # assign() builds a new frame rather than writing a column into the frame
    # handed out by groupby, which avoids pandas' chained-assignment warning.
    df_list.append(group.assign(prediction=predictions))

# One row per question, restricted to the columns the evaluation below needs.
df_pred = pd.concat(df_list)[
    ["category_id", "source", "question", "correct_answer", "prediction"]
]
# Share of exactly-matching predictions, reported separately for each value of
# the "source" column (Försäkringskassan, Skatteverket, etc...)
for source, source_rows in df_pred.groupby("source"):
    n_correct = sum(source_rows["correct_answer"] == source_rows["prediction"])
    accuracy = n_correct / len(source_rows)
    print(f"{source}: {accuracy}")

# Same measure over every row, regardless of source
overall_hits = df_pred["correct_answer"] == df_pred["prediction"]
print(f"Total accuracy: {sum(overall_hits) / len(df_pred)}")
# Number of questions in each category
group_lengths = [len(members) for _, members in df_pred.groupby("category_id")]

# Baseline: always answering with one fixed candidate per category gets one
# question right per category, i.e. n_categories / n_questions overall, which
# equals the reciprocal of the mean category size.
mean_group_length = np.mean(group_lengths)
expected_acc = 1 / mean_group_length
print(f"Expected naive guess accuracy: {expected_acc:.4f}")