-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster_aff.py
142 lines (124 loc) · 4.45 KB
/
cluster_aff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import csv
import pdb
from operator import itemgetter
import heapq
import math
def sum_cluster_affinities(memberships, aff_filename):
    """
    Sum, per cluster, the affinities that the cluster's members have
    for each subreddit.

    memberships -- mapping of user_id -> cluster_id (any value int() accepts)
    aff_filename -- path to a tab-separated file whose rows are
                    user_id, subreddit_id, affinity

    Rows whose user_id is not present in memberships are skipped.

    output form: clusters[cluster_id][subreddit_id] = summed_affinity
    where cluster_id is an int and subreddit_id is the string from the file
    """
    clusters = {}
    # 'with' guarantees the file is closed even when a row fails to parse
    with open(aff_filename) as f_aff:
        for row in csv.reader(f_aff, delimiter='\t'):
            # each row in the aff file is a tuple, so need to get the parts
            user_id = row[0]
            sr_id = row[1]
            affinity = float(row[2])
            try:
                c_id = int(memberships[user_id])
            except KeyError:
                # apparently not all UIDs in clabel are in the output of srrecs.r
                continue
            # create the per-cluster dict on first sight, then accumulate
            sr_sums = clusters.setdefault(c_id, {})
            sr_sums[sr_id] = sr_sums.get(sr_id, 0.0) + affinity
    return clusters
def check_recommendations(memberships, top_srs, test_data_file, threshold, medians):
    """
    Count how many test-set votes agree with the recommendations.

    memberships -- mapping of user_id -> cluster_id (any value int() accepts)
    top_srs -- indexable by int cluster_id; each entry is a container of the
               subreddit ids recommended to that cluster
    test_data_file -- path to a tab-separated file whose rows are
                      user_id, subreddit_id, affinity
    threshold -- currently unused; kept so existing callers keep working
                 (the per-subreddit median is used as the cutoff instead)
    medians -- mapping of subreddit_id -> median affinity

    A row is "good" when the subreddit is recommended to the user's cluster
    and the user's affinity is at least that subreddit's median, and "bad"
    when it is recommended but the affinity is below the median. Rows for
    users without a membership, or for subreddits that are not recommended
    to the user's cluster, are not counted.

    Returns [total_good, total_bad].
    """
    total_good = 0
    total_bad = 0
    # 'with' closes the test file when done (it used to be left open)
    with open(test_data_file) as f_aff:
        for row in csv.reader(f_aff, delimiter='\t'):
            # each row in the aff file is a tuple, so need to get the parts
            user_id = row[0]
            sr_id = row[1]
            affinity = float(row[2])
            try:
                recommended = top_srs[int(memberships[user_id])]
            except KeyError:
                # not all UIDs in affinities.dump appear in affinities.clabel
                continue
            if sr_id not in recommended:
                continue
            try:
                median = medians[sr_id]
            except KeyError:
                print("That's weird, that subreddit wasn't found in the medians")
                continue
            if affinity >= median:
                total_good += 1
            else:
                total_bad += 1
    return [total_good, total_bad]
def generate_and_check_recommendations(memberships, test_data_file, threshold, clusters, n_top, test_data_size, medians):
    """
    generates the top recommended subreddits (top_srs) and then calls
    check_recommendations to check them. Also outputs the total good and
    bad recommendations

    memberships -- mapping of user_id -> cluster_id
    test_data_file -- path of the test data, passed to check_recommendations
    threshold -- passed through to check_recommendations
    clusters -- clusters[cluster_id][subreddit_id] = summed affinity
    n_top -- how many of the highest-affinity subreddits to recommend
             per cluster
    test_data_size -- number of rows in the test data, used to report how
                      many rows were neither good nor bad
    medians -- passed through to check_recommendations

    Returns [good, bad].
    """
    # Key the recommendations by the real cluster ids rather than assuming
    # the ids are exactly 1..len(clusters).
    top_srs = {}
    for c_id, sr_affs in clusters.items():
        top_srs[c_id] = dict(heapq.nlargest(n_top, sr_affs.items(), key=itemgetter(1)))

    totals = check_recommendations(memberships, top_srs, test_data_file, threshold, medians)
    good = totals[0]
    bad = totals[1]

    checked = good + bad
    undefined = test_data_size - checked
    # Guard the percentages: with nothing checked (or an empty test set)
    # the divisions below would raise ZeroDivisionError.
    percent_good = good * 100.00 / checked if checked else "n/a"
    percent_undefined = undefined * 100.00 / test_data_size if test_data_size else "n/a"

    print("Total good recommendations: %s" % good)
    print("Total bad recommendations: %s" % bad)
    print("Total undefined: %s" % undefined)
    print("Percent good: %s" % percent_good)
    print("Percent undefined: %s" % percent_undefined)
    return [good, bad]
def generate_median_sr_affs(votes_file):
    """
    Compute the median affinity of every subreddit in votes_file.

    votes_file -- path to a tab-separated file whose rows are
                  user_id, subreddit_id, affinity

    output form: medians[subreddit_id] = median_affinity
    For an even number of affinities the median is the mean of the two
    middle values.
    """
    srs = {}
    # 'with' closes the votes file when done (it used to be left open)
    with open(votes_file, 'r') as f:
        for row in csv.reader(f, delimiter='\t'):
            # each row is user_id, sr_id, affinity; the user_id is not needed
            sr_id = row[1]
            affinity = float(row[2])
            srs.setdefault(sr_id, []).append(affinity)

    medians = {}
    for sr_id, affinities in srs.items():
        ordered = sorted(affinities)
        middle_index = len(ordered) // 2
        if len(ordered) % 2 == 0:
            # there is an even number of elements
            medians[sr_id] = (ordered[middle_index] + ordered[middle_index - 1]) / 2
        else:
            # there is an odd number of elements
            medians[sr_id] = ordered[middle_index]
    return medians