"""Utility functions for training and evaluation"""
# External dependencies
import pickle as pk
from sklearn import metrics
import numpy as np
import os.path
from gensim import matutils
from naga.shared.kb import KB
# Internal dependencies
from phrase2vec import Phrase2Vec
# Authorship
__author__ = "Ben Eisner, Tim Rocktaschel"
__email__ = "beisner@princeton.edu"


def generate_embeddings(ind2phr, kb, embeddings_file, word2vec_file, word2vec_dim=300):
    """Generate a numpy array of phrase embeddings for all phrases in the knowledge base.

    Since it is expensive to calculate these phrase embeddings every time, we cache the output
    in a file, from which we can load if this function is called on a set we've seen before.

    Args:
        ind2phr: Mapping from phrase indices to phrases in the KB
        kb: Knowledge base
        embeddings_file: File where we cache the embeddings
        word2vec_file: word2vec model file
        word2vec_dim: Dimension of the static word2vec model we use

    Returns:
        A numpy array of shape [len(ind2phr), word2vec_dim] whose row i holds the embedding
        of the phrase with index i in ind2phr.
    """
    phrase_vector_sums = dict()
    # compute phrase vectors from the word2vec model, unless we have a cached copy on disk
    if not os.path.isfile(embeddings_file):
        print('reading embedding data from: ' + word2vec_file)
        phrase_vec_model = Phrase2Vec.from_word2vec_paths(word2vec_dim, w2v_path=word2vec_file)
        print('generating vector subset')
        for phrase in kb.get_vocab(1):
            phrase_vector_sums[phrase] = phrase_vec_model[phrase]
        with open(embeddings_file, 'wb') as f:
            pk.dump(phrase_vector_sums, f)
    else:
        print('loading embeddings...')
        with open(embeddings_file, 'rb') as f:
            phrase_vector_sums = pk.load(f)
    # build the embeddings array, for lookup later
    embeddings_array = np.zeros(shape=[len(ind2phr), word2vec_dim], dtype=np.float32)
    for ind, phr in ind2phr.items():
        embeddings_array[ind] = phrase_vector_sums[phr]
    return embeddings_array
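
# For example (a sketch with hypothetical paths; the cache file is created on the first call):
#
#     embeddings = generate_embeddings(ind2phr, kb, embeddings_file='./cache/embeddings.pkl',
#                                      word2vec_file='./data/GoogleNews-vectors.bin')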


# Read data from a tab-separated file and inject it into a knowledge base
def __read_data(filename, base, ind_to_phr, ind_to_emoj, typ):
    with open(filename, 'r') as f:
        # build the data line by line; each line holds a phrase, an emoji, and a truth label
        for line in f:
            ph, em, truth = line.rstrip().split('\t')
            base.add((truth == 'True'), typ, em, ph)
            ind_to_phr[base.get_id(ph, 1)] = ph
            ind_to_emoj[base.get_id(em, 0)] = em
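
# The expected line format is phrase, emoji, and truth label, tab-separated, e.g.
# (hypothetical content):
#
#     crying face\t😢\tTrue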


def build_kb(data_folder):
    """Read training data from the training directory and generate a KB.

    Args:
        data_folder: Directory containing a train.txt, a dev.txt, and a test.txt from which
            we can assemble our knowledge base.

    Returns:
        The knowledge base, a mapping from KB indices to phrases, and a mapping from KB
        indices to emoji.
    """
    base = KB()
    # KB indices to phrase
    ind_to_phr = dict()
    # KB indices to emoji
    ind_to_emoj = dict()
    __read_data(os.path.join(data_folder, 'train.txt'), base, ind_to_phr, ind_to_emoj, 'train')
    __read_data(os.path.join(data_folder, 'dev.txt'), base, ind_to_phr, ind_to_emoj, 'dev')
    __read_data(os.path.join(data_folder, 'test.txt'), base, ind_to_phr, ind_to_emoj, 'test')
    return base, ind_to_phr, ind_to_emoj
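
# For example (a sketch; the directory name is hypothetical):
#
#     kb, ind2phr, ind2emoji = build_kb('./data/training')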


def get_examples_from_kb(kb, example_type='train'):
    """Extract all the examples of a given type (e.g. train, dev, test) from the knowledge base.

    Args:
        kb: Knowledge base
        example_type: Name of the example type (e.g. train, dev, test)

    Returns:
        Lists of the rows (phrase indices), columns (emoji indices), and targets from the dataset
    """
    # prepare the example set
    batch = list(kb.get_all_facts([example_type]))
    rows = list()
    cols = list()
    targets = list()
    for example in batch:
        cols.append(kb.get_id(example[0][0], 0))
        rows.append(kb.get_id(example[0][1], 1))
        targets.append(example[1])
    return rows, cols, targets
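
# For example (a sketch, assuming kb comes from build_kb above):
#
#     rows, cols, targets = get_examples_from_kb(kb, example_type='dev')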


def __sigmoid(x):
    # np.exp handles scalars; np.math was an undocumented alias removed in newer NumPy
    return 1 / (1 + np.exp(-x))


def generate_predictions(e2v, dset, phr_embeddings, ind2emoji, threshold):
    """Calculate whether a set of emoji/phrase pairs are correlated.

    This implementation doesn't use TensorFlow, and relies instead on injected vectors.

    Args:
        e2v: Mapping from emoji to vector, typically the trained emoji vectors from our model.
        dset: Tuple of (phrase indices, emoji indices, truth labels) for pairs of emoji and
            phrases, as returned by get_examples_from_kb.
        phr_embeddings: Map between phrase indices and phrase vectors, as computed by the
            vector sum of the word vectors for that phrase.
        ind2emoji: Map between emoji index and emoji, for converting dset indices into emoji.
        threshold: Threshold for classifying a correlation as true or false.

    Returns:
        y_pred_labels: List of predicted labels for pairs in the dataset
        y_pred_values: List of predicted scores for pairs in the dataset
        y_true_labels: List of true labels for pairs in the dataset
        y_true_values: List of true scores for pairs in the dataset
    """
    y_pred_labels = list()
    y_pred_values = list()
    phr_ixs, em_ixs, truths = dset
    for (phr_ix, em_ix, truth) in zip(phr_ixs, em_ixs, truths):
        # score each pair with the sigmoid of the cosine similarity of the two unit vectors
        prob = __sigmoid(
            np.dot(matutils.unitvec(phr_embeddings[phr_ix]), matutils.unitvec(e2v[ind2emoji[em_ix]])))
        y_pred_values.append(prob)
        y_pred_labels.append(prob >= threshold)  # Threshold predicted probability
    y_true_values = [float(v) for v in truths]
    return y_pred_labels, y_pred_values, truths, y_true_values
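
# For example (a sketch; e2v is assumed to map each emoji to a trained vector):
#
#     pred_labels, pred_values, true_labels, true_values = generate_predictions(
#         e2v, get_examples_from_kb(kb, 'dev'), phr_embeddings, ind2emoji, threshold=0.5)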


def get_metrics(pred_labels, pred_values, truth_labels, truth_values):
    """Get a set of metrics, including accuracy, f1 score, and area under the ROC curve.

    This method takes in predictions and spits out performance metrics.

    Args:
        pred_labels: Predicted labels for the correlation between an emoji and a phrase.
        pred_values: Predicted correlation values between an emoji and a phrase.
        truth_labels: True labels for the correlation between an emoji and a phrase.
        truth_values: True correlation values between an emoji and a phrase.

    Returns:
        The accuracy, f1 score, and ROC AUC ('N/A' when the AUC is undefined, e.g. when
        only one class is present).
    """
    acc = metrics.accuracy_score(y_true=truth_labels, y_pred=pred_labels)
    f1 = metrics.f1_score(y_true=truth_labels, y_pred=pred_labels)
    try:
        auc = metrics.roc_auc_score(y_true=truth_values, y_score=pred_values)
    except ValueError:
        # roc_auc_score raises ValueError when only one class is present in truth_values
        auc = 'N/A'
    return acc, f1, auc
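
# End-to-end evaluation sketch (hypothetical paths; e2v would come from a trained model):
#
#     kb, ind2phr, ind2emoji = build_kb('./data/training')
#     phr_embeddings = generate_embeddings(ind2phr, kb, './cache/embeddings.pkl',
#                                          './data/word2vec.bin')
#     dset = get_examples_from_kb(kb, example_type='dev')
#     pred_labels, pred_values, true_labels, true_values = generate_predictions(
#         e2v, dset, phr_embeddings, ind2emoji, threshold=0.5)
#     acc, f1, auc = get_metrics(pred_labels, pred_values, true_labels, true_values)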