import gensim
import re
from collections import namedtuple

# Each document keeps its tokens, its Doc2Vec tag, its train/test/extra split, and a sentiment label.
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')
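# Load the corpus. Assumption (not stated in this file): gutenberg.txt holds one
# whitespace-tokenizable document per line, ordered so that the slice-based split and
# sentiment assignments below line up with the data.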
alldocs = [] # will hold all docs in original order
with open('gutenberg.txt') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[0:]
        tags = [line_no]  # `tags = [tokens[0]]` would also work, at extra memory cost
        split = ['train', 'test', 'extra', 'extra'][line_no // 30000]  # first 30k train, next 30k test, rest extra
        sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no // 10000]  # alternating 10k pos / 10k neg for the first 40k lines, then unknown
        alldocs.append(SentimentDocument(words, tags, split, sentiment))
train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:] # for reshuffling per pass
print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing
cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"
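# FAST_VERSION > -1 indicates gensim's optimized (Cython) training routines compiled; pure-Python training is far slower.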
simple_models = [
    # PV-DM w/concatenation - window=5 (both sides) approximates the paper's 10-word total window size
    # Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW
    Doc2Vec(dm=0, size=300, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/average
    # Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]
# Speed up setup by sharing the results of the first model's vocabulary scan.
simple_models[0].build_vocab(alldocs)  # any additional models reuse this scan via reset_from() below
print(simple_models[0])
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)
models_by_name = OrderedDict((str(model), model) for model in simple_models)
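# Key each model by its self-description string, so the progress lines below are self-labeling.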
import numpy as np
import statsmodels.api as sm
from random import sample

# for timing
from contextlib import contextmanager
from timeit import default_timer
@contextmanager
def elapsed_timer():
    """Yield a callable reporting seconds elapsed since the block was entered."""
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end - start
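# Usage sketch: `with elapsed_timer() as elapsed:` ... then `elapsed()` returns seconds so far;
# after the block exits, it returns the block's total duration.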
def logistic_predictor_from_data(train_targets, train_regressors):
    """Fit a logistic-regression sentiment predictor on document vectors."""
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0)
    # print(predictor.summary())
    return predictor
def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_set sentiments, using the supplied model and train_set."""
    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)
    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_data]  # was `test_docs`: use the parameter, not the global
    test_regressors = sm.add_constant(test_regressors)
    # predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)
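# Two evaluation modes: by default the classifier is scored on the vectors learned for the test
# docs during training; with infer=True, fresh vectors are inferred for a subsample of the test
# docs, which better reflects performance on genuinely unseen text.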
from collections import defaultdict
best_error = defaultdict(lambda: 1.0)  # to selectively print only the best error rates achieved
from random import shuffle
import datetime
alpha, min_alpha, passes = (0.025, 0.001, 100)
alpha_delta = (alpha - min_alpha) / passes
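# The effective learning rate decays linearly from alpha to min_alpha across the passes;
# each pass re-shuffles the corpus and trains every model for one epoch at the current rate.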
print("START %s" % datetime.datetime.now())
for epoch in range(passes):
    shuffle(doc_list)  # shuffling gets best results
    for name, train_model in models_by_name.items():
        # train
        duration = 'na'
        train_model.alpha, train_model.min_alpha = alpha, alpha
        with elapsed_timer() as elapsed:
            # one epoch per outer pass; gensim 1.0+ requires these explicit counts
            train_model.train(doc_list, total_examples=len(doc_list), epochs=1)
            duration = '%.1f' % elapsed()
        # evaluate
        eval_duration = ''
        with elapsed_timer() as eval_elapsed:
            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
        eval_duration = '%.1f' % eval_elapsed()
        best_indicator = ' '
        if err <= best_error[name]:
            best_error[name] = err
            best_indicator = '*'
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))
        if ((epoch + 1) % 5) == 0 or epoch == 0:
            eval_duration = ''
            with elapsed_timer() as eval_elapsed:
                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)
            eval_duration = '%.1f' % eval_elapsed()
            best_indicator = ' '
            if infer_err < best_error[name + '_inferred']:
                best_error[name + '_inferred'] = infer_err
                best_indicator = '*'
            print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))
    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta
print("END %s" % str(datetime.datetime.now()))
# print best error rates achieved
for rate, name in sorted((rate, name) for name, rate in best_error.items()):
    print("%f %s" % (rate, name))
doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick a random doc; re-run for more examples
print('for doc %d...' % doc_id)
for model in simple_models:
    inferred_docvec = model.infer_vector(alldocs[doc_id].words)
    print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))
    model.save(re.sub('[^0-9a-zA-Z]+', '', str(model)) + '-breaks.doc2vec')
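# Reload a saved model for the similarity demo below. Note: 'gutenberg.doc2vec' is assumed to
# exist from an earlier run; the save() calls above write '<modelname>-breaks.doc2vec' files.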
model = Doc2Vec.load('gutenberg.doc2vec')
doc_id = np.random.randint(model.docvecs.count)  # pick a random doc; re-run for more examples
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents
print(u'TARGET (%d): %s\n' % (doc_id, ' '.join(alldocs[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims) // 2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: %s\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))