"""
Evaluation code for multimodal ranking
Throughout, we assume 5 captions per image, and that
captions[5i:5i+5] are GT descriptions of images[i]
"""
import numpy
import datasets
from datasource import Datasource
import tools

def ranking_eval_5fold(model, split='dev'):
    """
    Evaluate a trained model on either the dev or test split of the dataset
    it was trained on. Evaluates separately on five 1000-image folds and
    averages the metrics.
    """
    data = model['options']['data']
    cnn = model['options']['cnn']

    results = []

    for fold in range(5):
        print('Loading fold ' + str(fold))
        dataset = datasets.load_dataset(data, cnn, load_train=False, fold=fold)
        caps, ims = Datasource(dataset[split], model['worddict']).all()

        print('Computing results...')
        c_emb = tools.encode_sentences(model, caps)
        i_emb = tools.encode_images(model, ims)

        errs = tools.compute_errors(model, c_emb, i_emb)

        r = t2i(errs)
        print('Text to image: %.1f, %.1f, %.1f, %.1f, %.1f' % tuple(r))

        ri = i2t(errs)
        print('Image to text: %.1f, %.1f, %.1f, %.1f, %.1f' % tuple(ri))

        results.append(r + ri)

    print('-----------------------------------')
    print('Mean metrics:')
    mean_metrics = numpy.array(results).mean(axis=0).flatten()
    print('Text to image: %.1f, %.1f, %.1f, %.1f, %.1f' % tuple(mean_metrics[:5]))
    print('Image to text: %.1f, %.1f, %.1f, %.1f, %.1f' % tuple(mean_metrics[5:]))

def t2i(c2i, vis_details=False):
    """
    Text->Images (image search)
    c2i: (5N, N) matrix of caption-to-image errors
    vis_details: if True, also return a dictionary for ROC visualization
    """
    ranks = numpy.zeros(c2i.shape[0])
    vis_dict = {'sentences': []}

    for i in range(len(ranks)):
        d_i = c2i[i]
        inds = numpy.argsort(d_i)

        # The GT image for caption i is image i // 5 (5 captions per image)
        rank = numpy.where(inds == i // 5)[0][0]
        ranks[i] = rank

        def image_dict(k):
            return {'id': int(k), 'score': float(d_i[k])}

        if vis_details:  # save the top-10 images, the GT image, and their scores
            vis_dict['sentences'].append({
                'id': i,
                'rank': int(rank) + 1,
                'gt_image': image_dict(i // 5),
                'top_images': [image_dict(k) for k in inds[:10]],
            })

    # Compute metrics
    r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks)
    medr = numpy.floor(numpy.median(ranks)) + 1
    meanr = ranks.mean() + 1

    stats = [float(x) for x in (r1, r5, r10, medr, meanr)]

    if not vis_details:
        return stats
    vis_dict['stats'] = {'R@1': r1, 'R@5': r5, 'R@10': r10,
                         'median_rank': medr, 'mean_rank': meanr}
    return stats, vis_dict
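
# Sketch of the optional second return value of t2i when vis_details=True
# (field values illustrative; the structure matches what is built above):
#
#     {'sentences': [{'id': 0, 'rank': 3,
#                     'gt_image': {'id': 0, 'score': 0.12},
#                     'top_images': [{'id': 7, 'score': 0.05}, ...]},
#                    ...],
#      'stats': {'R@1': ..., 'R@5': ..., 'R@10': ...,
#                'median_rank': ..., 'mean_rank': ...}}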

def i2t(c2i):
    """
    Images->Text (image annotation)
    c2i: (5N, N) matrix of caption-to-image errors
    """
    ranks = numpy.zeros(c2i.shape[1])

    for i in range(len(ranks)):
        d_i = c2i[:, i]
        inds = numpy.argsort(d_i)

        # Rank of the best of image i's 5 GT captions (caption k describes image k // 5)
        rank = numpy.where(inds // 5 == i)[0][0]
        ranks[i] = rank

    # Compute metrics
    r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks)
    medr = numpy.floor(numpy.median(ranks)) + 1
    meanr = ranks.mean() + 1

    return [float(x) for x in (r1, r5, r10, medr, meanr)]
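
# Minimal self-contained demo on synthetic data: run both metrics on a fake
# (5N, N) error matrix. In real use the matrix would come from
# tools.compute_errors on encoded captions and images, as in ranking_eval_5fold.
if __name__ == '__main__':
    N = 100
    rng = numpy.random.RandomState(0)
    fake_errs = rng.rand(5 * N, N)

    # Make each caption's GT image slightly cheaper so the rankings are non-trivial
    for c in range(5 * N):
        fake_errs[c, c // 5] -= 0.5

    print('t2i on synthetic errors: %.1f, %.1f, %.1f, %.1f, %.1f' % tuple(t2i(fake_errs)))
    print('i2t on synthetic errors: %.1f, %.1f, %.1f, %.1f, %.1f' % tuple(i2t(fake_errs)))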