-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclusterEvaluation.py
81 lines (71 loc) · 3.07 KB
/
clusterEvaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import sys
import os
import re
from collections import defaultdict
def get_author_accuracy(cluster):
    """Summarise the per-author article counts of a single cluster.

    Args:
        cluster: Mapping of author name to the number of that author's
            articles assigned to the cluster.

    Returns:
        A 4-tuple ``(max_author, max_count, total_count, cluster_counts)``:
        the author with the largest count (the first one encountered wins a
        tie), that count, the sum of all counts, and a ``defaultdict(int)``
        copy of the input counts.  When the mapping is empty or no count is
        positive, ``max_author`` is ``None`` and ``max_count`` is ``0``.
    """
    # Copy into a fresh defaultdict so the caller's mapping is never aliased.
    cluster_counts = defaultdict(int)
    cluster_counts.update(cluster)

    total_count = sum(cluster_counts.values())

    # max() returns the first item carrying the maximal value, which gives
    # the same tie-break as a strict ">" scan in insertion order.
    top_author, top_count = max(
        cluster_counts.items(),
        key=lambda pair: pair[1],
        default=(None, 0),
    )
    if top_count <= 0:
        # Only a strictly positive count can nominate an author.
        top_author, top_count = None, 0

    return top_author, top_count, total_count, cluster_counts
def _record_cluster(cluster_num, cluster_counts, author_hits_misses, accuracies):
    """Score one finished cluster, update the running tallies and print its report.

    Args:
        cluster_num: Integer id of the cluster being reported.
        cluster_counts: Mapping of author name to article count for the cluster.
        author_hits_misses: Dict of author -> ``(hits, misses)``; updated in place.
        accuracies: List of per-cluster accuracies; the new value is appended.
    """
    max_author, max_count, total_count, author_counts = \
        get_author_accuracy(cluster_counts)
    if total_count == 0:
        # A header with no article lines has no majority author and no
        # defined accuracy; skip it instead of dividing by zero.
        return

    accuracy = max_count / total_count
    accuracies.append(accuracy)

    for author, count in author_counts.items():
        hits, misses = author_hits_misses.get(author, (0, 0))
        if author == max_author:
            # Articles by the cluster's majority author count as hits.
            author_hits_misses[author] = (hits + count, misses)
        else:
            author_hits_misses[author] = (hits, misses + count)

    print("Cluster " + str(cluster_num))
    print("Author Prediction: " + max_author)
    print("Accuracy: " + str(max_count) + "/" + str(total_count) + " " + str(accuracy * 100) + "%")
    print()


def main():
    """Evaluate the clustering stored in ``clusterOutput.txt`` and print a report.

    The file is read line by line.  A line containing ``"Cluster"`` starts a
    new cluster; its id is the integer found in the second field after
    splitting on spaces and colons.  Every other non-blank line is split on
    ``/`` and its second-to-last component is taken as the author of one
    article in the current cluster.

    For each cluster the most frequent author is printed as the prediction,
    together with that author's share of the cluster.  Afterwards the
    per-author hit rate, the mean of those hit rates, and the unweighted mean
    of the per-cluster accuracies are printed.
    """
    cluster_author_counts = dict()
    author_hits_misses = dict()
    cluster_num = None
    accuracies = []

    with open("clusterOutput.txt", "r") as cluster_file:
        for line in cluster_file:
            if "Cluster" in line:
                # Compare against None: a cluster id of 0 is falsy but valid.
                if cluster_num is not None:
                    _record_cluster(cluster_num, cluster_author_counts[cluster_num],
                                    author_hits_misses, accuracies)
                cluster_num = int(re.split(" |:", line)[1])
                cluster_author_counts[cluster_num] = defaultdict(int)
            elif line.strip():
                # Blank lines carry no article and are skipped.
                author, _article = line.split('/')[-2:]
                cluster_author_counts[cluster_num][author] += 1

    # The last cluster has no following header to trigger its report.
    if cluster_num is not None:
        _record_cluster(cluster_num, cluster_author_counts[cluster_num],
                        author_hits_misses, accuracies)

    print("Author accuracies:")
    total = 0
    for author, (hits, misses) in author_hits_misses.items():
        author_accuracy = hits / (hits + misses)
        total += author_accuracy
        print(author + ": " + str(author_accuracy * 100) + "%")

    # Guard the averages so an empty input file does not divide by zero.
    if author_hits_misses:
        print("\nAverage Author Accuracy: " + str(total / len(author_hits_misses)))
    if accuracies:
        print("Total Accuracy: " + str(sum(accuracies) / len(accuracies) * 100) + "%")
# Run the evaluation only when this file is executed as a script, so that
# importing the module does not read clusterOutput.txt or print anything.
if __name__ == "__main__":
    main()