forked from YahooArchive/crow
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevaluate.py
268 lines (207 loc) · 8.69 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# Copyright 2015, Yahoo Inc.
# Licensed under the terms of the Apache License, Version 2.0. See the LICENSE file associated with the project for terms.
import sys
import os
import glob
from functools import partial
from tempfile import NamedTemporaryFile
import numpy as np
from crow import run_feature_processing_pipeline, apply_crow_aggregation, apply_ucrow_aggregation, normalize
def get_nn(x, data, k=None):
    """
    Rank index vectors by squared Euclidean distance from a query vector.

    :param ndarray x:
        the query vector
    :param ndarray data:
        the index vectors
    :param int k:
        optional cap on the number of results returned
    :returns ndarray idx:
        indices of index vectors, nearest first
    :returns ndarray dists:
        the corresponding squared distances, ascending
    """
    limit = len(data) if k is None else k
    # Squared L2 distance from the query to every index vector
    sq_dists = np.sum((x - data) ** 2, axis=1)
    order = np.argsort(sq_dists)
    return order[:limit], sq_dists[order][:limit]
def simple_query_expansion(Q, data, inds, top_k=10):
    """
    Sum the query with its top-k closest index vectors and re-normalize.

    :param ndarray Q:
        query vector
    :param ndarray data:
        index data vectors
    :param ndarray inds:
        the indices of index vectors in ascending order of distance
    :param int top_k:
        the number of closest vectors to fold into the query
    :returns ndarray:
        the normalized expanded query vector
    """
    # Use Q + ... (not Q +=) so the caller's array is not mutated in place.
    Q = Q + data[inds[:top_k], :].sum(axis=0)
    return normalize(Q)
def load_features(feature_dir, verbose=True):
    """
    Iterate and load numpy pickle files in the provided directory along with the filename.

    :param feature_dir: directory to iterate or list of directories
    :type feature_dir: str or list
    :param bool verbose:
        optional flag to disable progress printout
    :yields ndarray X:
        the ndarray from the pickle file
    :yields str name:
        the name of the file without file extension
    """
    # isinstance (not type ==) so str subclasses are accepted too
    if isinstance(feature_dir, str):
        feature_dir = [feature_dir]
    for directory in feature_dir:
        for i, f in enumerate(os.listdir(directory)):
            name = os.path.splitext(f)[0]
            # Print progress every 100 files
            if verbose and not i % 100:
                sys.stdout.write('\rProcessing file %i' % i)
                sys.stdout.flush()
            X = np.load(os.path.join(directory, f))
            yield X, name
    sys.stdout.write('\n')
    sys.stdout.flush()
def load_and_aggregate_features(feature_dir, agg_fn):
    """
    Given a directory of features as numpy pickles, load them, map them
    through the provided aggregation function, and return a list of
    the features and a list of the corresponding file names without the
    file extension.

    :param feature_dir: directory to iterate or list of directories
    :type feature_dir: str or list
    :param callable agg_fn:
        map function for raw features
    :returns list features:
        the list of loaded features
    :returns list names:
        corresponding file names without extension
    """
    # Parenthesized single-arg print works under both Python 2 and 3.
    print('Loading features %s ...' % str(feature_dir))
    features = []
    names = []
    for X, name in load_features(feature_dir):
        names.append(name)
        features.append(agg_fn(X))
    return features, names
def get_ap(inds, dists, query_name, index_names, groundtruth_dir, ranked_dir=None):
    """
    Given a query, index data, and path to groundtruth data, perform the query,
    and evaluate average precision for the results by calling to the compute_ap
    script. Optionally save ranked results in a file.

    :param ndarray inds:
        the indices of index vectors in ascending order of distance
    :param ndarray dists:
        the squared distances
    :param str query_name:
        the name of the query
    :param list index_names:
        the name of index items
    :param str groundtruth_dir:
        directory of groundtruth files
    :param str ranked_dir:
        optional path to a directory to save ranked list for query
    :returns float:
        the average precision for this query
    """
    if ranked_dir is not None:
        # Create dir for ranked results if needed
        if not os.path.exists(ranked_dir):
            os.makedirs(ranked_dir)
        rank_file = os.path.join(ranked_dir, '%s.txt' % query_name)
        f = open(rank_file, 'w')
    else:
        # Text mode so str lines can be written on Python 3 as well
        f = NamedTemporaryFile(mode='w', delete=False)
        rank_file = f.name
    try:
        f.writelines([index_names[i] + '\n' for i in inds])
    finally:
        # Always release the handle, even if writelines raises
        f.close()
    groundtruth_prefix = os.path.join(groundtruth_dir, query_name)
    # NOTE(review): shell command is built from raw paths — assumes trusted,
    # shell-safe directory/query names; confirm inputs are never user-supplied.
    cmd = './compute_ap %s %s' % (groundtruth_prefix, rank_file)
    ap = os.popen(cmd).read()
    # Delete temp file
    if ranked_dir is None:
        os.remove(rank_file)
    return float(ap.strip())
def fit_whitening(whiten_features, agg_fn, d):
    """
    Calculate whitening parameters

    :param str whiten_features:
        directory of features to fit whitening
    :param callable agg_fn:
        aggregation function
    :param int d:
        final feature dimension
    :returns dict params:
        a dict of transformation parameters
    """
    # Load features for fitting whitening
    data, _ = load_and_aggregate_features(whiten_features, agg_fn)
    # Whiten, and reduce dim of features
    # Whitening is trained on the same images that we query against here for expediency
    # Parenthesized single-arg print works under both Python 2 and 3; "wth" typo fixed.
    print('Fitting PCA/whitening with d=%d on %s ...' % (d, whiten_features))
    _, whiten_params = run_feature_processing_pipeline(data, d=d)
    return whiten_params
def run_eval(queries_dir, groundtruth_dir, index_features, whiten_params, out_dir, agg_fn, qe_fn=None):
    """
    Run full evaluation pipeline on specified data.

    :param str queries_dir: directory of query features
    :param str groundtruth_dir: directory of groundtruth info
    :param index_features: directory or list of directories of index features
    :type index_features: str or list
    :param dict whiten_params: precomputed whitening parameters (see fit_whitening)
    :param str out_dir: directory to save query results
    :param callable agg_fn: aggregation function
    :param callable qe_fn: query expansion function
    :returns float: mean average precision over all queries
    """
    data, image_names = load_and_aggregate_features(index_features, agg_fn)
    data, _ = run_feature_processing_pipeline(np.vstack(data), params=whiten_params)
    # Iterate queries, process them, rank results, and evaluate mAP
    aps = []
    for Q, query_name in load_features(queries_dir):
        Q = agg_fn(Q)
        # Normalize and PCA to final feature
        Q, _ = run_feature_processing_pipeline(Q, params=whiten_params)
        inds, dists = get_nn(Q, data)
        # Optionally expand the query with its top results and re-rank
        if qe_fn is not None:
            Q = qe_fn(Q, data, inds)
            inds, dists = get_nn(Q, data)
        ap = get_ap(inds, dists, query_name, image_names, groundtruth_dir, out_dir)
        aps.append(ap)
    return np.array(aps).mean()
if __name__ == '__main__':
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument('--wt', dest='weighting', type=str, default='crow', help='weighting to apply for feature aggregation')
    parser.add_argument('--index_features', dest='index_features', type=str, default='oxford/pool5', help='directory containing raw features to index')
    parser.add_argument('--whiten_features', dest='whiten_features', type=str, default='paris/pool5', help='directory containing raw features to fit whitening')
    parser.add_argument('--queries', dest='queries', type=str, default='oxford/pool5_queries/', help='directory containing image files')
    parser.add_argument('--groundtruth', dest='groundtruth', type=str, default='oxford/groundtruth/', help='directory containing groundtruth files')
    parser.add_argument('--d', dest='d', type=int, default=128, help='dimension of final feature')
    parser.add_argument('--out', dest='out', type=str, default=None, help='optional path to save ranked output')
    parser.add_argument('--qe', dest='qe', type=int, default=0, help='perform query expansion with this many top results')
    args = parser.parse_args()

    # Select which aggregation function to apply
    if args.weighting == 'crow':
        agg_fn = apply_crow_aggregation
    else:
        agg_fn = apply_ucrow_aggregation

    # Bind the requested expansion depth into the query-expansion callable
    if args.qe > 0:
        qe_fn = partial(simple_query_expansion, top_k=args.qe)
    else:
        qe_fn = None

    # compute whitening params
    whitening_params = fit_whitening(args.whiten_features, agg_fn, args.d)

    # compute aggregated features and run the evaluation
    mAP = run_eval(args.queries, args.groundtruth, args.index_features, whitening_params, args.out, agg_fn, qe_fn)
    # Parenthesized single-arg print works under both Python 2 and 3;
    # sys.exit is the explicit form (builtin exit() is for interactive use).
    print('mAP: %f' % mAP)
    sys.exit(0)