-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfeature_extractor.py
executable file
·112 lines (98 loc) · 3.9 KB
/
feature_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python
"""
Extracts graph features for PCA and logistic regression.
For each user, we generate a training example with:
label - expert or nonexpert
[0] Authority score at 10th percentile of active lifetime
[1] Authority score at 20th percentile of active lifetime
[2] Authority score at 30th percentile of active lifetime
[3] Authority score at 40th percentile of active lifetime
[4] Authority score at 50th percentile of active lifetime
[5] PageRank score at 10th percentile of active lifetime
[6] PageRank score at 20th percentile of active lifetime
[7] PageRank score at 30th percentile of active lifetime
[8] PageRank score at 40th percentile of active lifetime
[9] PageRank score at 50th percentile of active lifetime
[10] ELO score at 10th percentile of active lifetime
[11] ELO score at 20th percentile of active lifetime
[12] ELO score at 30th percentile of active lifetime
[13] ELO score at 40th percentile of active lifetime
[14] ELO score at 50th percentile of active lifetime
[15] CAU score at 10th percentile of active lifetime
[16] CAU score at 20th percentile of active lifetime
[17] CAU score at 30th percentile of active lifetime
[18] CAU score at 40th percentile of active lifetime
[19] CAU score at 50th percentile of active lifetime
Our goal is to use the evolution of each user's network
properties at early stages to predict where they will
be at the end of the dataset (expert or nonexpert).
Because it is difficult to make a relation between
experts as identified by StackExchange reputation
and experts as vetted by people in real life, for the
purpose of this project we will assume that reputation
is ground truth for expertise. We define experts to be
the top 10 percentile of users by StackExchange reputation.
"""
from random import shuffle
import sys
import metrics
import search_utilities
import ml
# Fractions of a user's active lifetime (10% through 50%) at which every
# metric is sampled; handed to the metrics helpers as their `samples` argument.
feature_percentiles = [0.1, 0.2, 0.3, 0.4, 0.5]
def cau_scores(cur, conn, user_id):
    """
    Return one user's CAU scores, sampled at each fraction listed in
    feature_percentiles (10th to 50th percentile).

    Thin wrapper: cur, conn and user_id are forwarded unchanged to
    metrics.cau_for_user, and its result is returned as-is.
    """
    sample_points = feature_percentiles
    return metrics.cau_for_user(cur, conn, user_id, samples=sample_points)
def elo_scores(cur, conn, user_id):
    """
    Return one user's ELO scores, sampled at each fraction listed in
    feature_percentiles (10th to 50th percentile).

    Thin wrapper: cur, conn and user_id are forwarded unchanged to
    metrics.elo_for_user, and its result is returned as-is.
    """
    scores = metrics.elo_for_user(
        cur, conn, user_id, samples=feature_percentiles
    )
    return scores
def pagerank_scores(cur, user_id):
    """
    Return one user's PageRank scores, sampled at each fraction listed
    in feature_percentiles (10th to 50th percentile).

    Thin wrapper: cur and user_id are forwarded unchanged to
    metrics.pagerank_for_user, and its result is returned as-is.
    """
    sample_points = feature_percentiles
    return metrics.pagerank_for_user(cur, user_id, samples=sample_points)
def auth_scores(cur, user_id):
    """
    Return one user's Authority scores, sampled at each fraction listed
    in feature_percentiles (10th to 50th percentile).

    Thin wrapper: cur and user_id are forwarded unchanged to
    metrics.auth_for_user, and its result is returned as-is.
    """
    scores = metrics.auth_for_user(
        cur, user_id, samples=feature_percentiles
    )
    return scores
def is_expert(user_id):
    """
    Return True when user_id appears in the collection returned by
    search_utilities.get_experts(), False otherwise.

    The expert collection is fetched anew on every call.
    """
    experts = search_utilities.get_experts()
    return user_id in experts
def training_examples(cur, conn, user_ids):
    """
    Build feature vectors and labels for the given user_ids.

    Each user contributes one feature vector made of their ELO scores
    followed by their CAU scores, and one label: 1 if the user is in
    search_utilities.get_experts(), 0 otherwise. The Authority and
    PageRank features are currently disabled (see the note in the loop).

    cur, conn -- passed through unchanged to elo_scores / cau_scores.
    user_ids  -- sized collection of user ids; may be empty.

    Returns a (feature_vectors, labels) pair of parallel lists, in the
    same order as user_ids. A progress fraction is printed per user.
    """
    total = len(user_ids)
    if total == 0:
        # Nothing to extract; skip fetching the expert list entirely.
        return [], []

    # Fetch the expert list once instead of once per user, and keep it
    # in a set so each membership test is O(1).
    # NOTE(review): assumes user ids are hashable (e.g. ints) -- confirm.
    experts = set(search_utilities.get_experts())

    fv = []
    labels = []
    for counter, user_id in enumerate(user_ids, 1):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Training example building progress: %f"
              % (float(counter) / total))
        # Authority and PageRank features are switched off; to restore
        # them, prepend auth_scores(cur, user_id) and
        # pagerank_scores(cur, user_id) to the vector below.
        uv = []
        uv += elo_scores(cur, conn, user_id)
        uv += cau_scores(cur, conn, user_id)
        fv.append(uv)
        labels.append(1 if user_id in experts else 0)
    return fv, labels
def main(args):
    """
    Build the dataset, split it in half, and run the logistic test.

    Connects through metrics.connect, gathers every expert and
    nonexpert id from search_utilities, shuffles them, extracts the
    training examples, then trains on the first half and evaluates on
    the second half via ml.logistic_test.

    args -- command-line arguments (sys.argv); currently unused.
    """
    # NOTE(review): presumably database name and user name -- confirm
    # against metrics.connect.
    conn, cur = metrics.connect("cooking", "Ben-han")

    user_ids = search_utilities.get_experts() + search_utilities.get_nonexperts()
    # Shuffle in place so the split below is not ordered experts-first.
    shuffle(user_ids)

    data, labels = training_examples(cur, conn, user_ids)

    # Floor division keeps the slice index an int on Python 3 as well;
    # data and labels are parallel lists, so one midpoint serves both.
    half = len(data) // 2
    train_data, test_data = data[:half], data[half:]
    train_labels, test_labels = labels[:half], labels[half:]

    # Evaluate on the held-out half. The original passed the training
    # half twice, leaving test_data / test_labels unused.
    # NOTE(review): assumes logistic_test takes (train_x, train_y,
    # test_x, test_y) -- confirm against ml.logistic_test.
    ml.logistic_test(train_data, train_labels, test_data, test_labels, cv=True)
    # ml.plot_pca(data, labels)
# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)