-
Notifications
You must be signed in to change notification settings - Fork 150
/
vw-varinfo2
executable file
·480 lines (389 loc) · 12.8 KB
/
vw-varinfo2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim: ts=4 sw=4 expandtab
"""
vw-varinfo2: vw dataset summary & variable importance
This is a new & simpler implementation of the original vw-varinfo
It is designed to be simpler and faster. There's less dependence
on command line options so it is much more robust against future
changes and new options in vowpal-wabbit.
This implementation is in python (original was in perl)
TODO: multi-class support is not implemented!
Author: Ariel Faigon (2016)
"""
import sys
import os
import subprocess
import re
import itertools
import tempfile
import traceback
from operator import itemgetter
# Debug-message switch; get_vw_cmd() turns it on when the first arg is '-v'
Verbose = False
# Our own program name (basename of argv[0]), used in the usage message
ARGV0 = os.path.basename(sys.argv[0])
# Default vw executable program to call
# (is_vw_arg() replaces it when the user names a different vw executable)
VW = 'vw'
# Additional VW args which should be reused from 1st to 2nd pass
VWARGS = []
# Hash mappings for per-feature (min, max, hash-value, weight)
# All four are keyed by the feature name as it appears in vw audit output
# and are filled in by process_audit_line()
F_MIN = {}
F_MAX = {}
F_HASH = {}
F_WEIGHT = {}
# We need to have a model at the end to load all weights
# If it is not supplied on the command line, we add it ourselves
# (CleanupModel records that the model file is ours to delete afterwards)
ModelName = ''
CleanupModel = False
# A global switch and list of all seen labels to support MultiClass
# (both are only set in main(); nothing visible in this file reads them yet)
MultiClass = False
MCLabels = None
def v(msg):
    """Write msg plus a trailing newline to stderr and flush right away."""
    err = sys.stderr
    err.write("%s\n" % msg)
    err.flush()
def d(msg):
    """Debugging message: forwarded to v() only when Verbose ('-v') is on."""
    if Verbose:
        v(msg)
def fatal(msg):
    """Report an unrecoverable error on stderr, then exit with status 1."""
    report = "== FATAL: %s" % msg
    v(report)
    sys.exit(1)
def usage(msg):
    """
    Print an optional message followed by the usage text, then exit(1)

    msg is printed first when it is non-empty; pass '' to print only
    the usage text.
    """
    if msg:
        v(msg)

    synopsis = "Usage: %s [-v] [<vw>] [<vw_options>] <vw_args>..." % ARGV0
    # One entry per output line; v() supplies the final newline
    notes = (
        " Notes:",
        "\t- You may omit the <vw> argument (default is 'vw')",
        "\t- You may use a different <vw> executable as the 1st arg",
        "\t- <vw_args> are all the vw arguments, as you would call vw directly",
        "\t- If <vw_args> is just a dataset-file - vw defaults will be used",
        "\t- To lose the constant (intercept), use vw's '--noconstant' option",
        "\t However the constant may be useful to show if there's a bias",
    )
    v(synopsis)
    v("\n".join(notes))
    sys.exit(1)
def which(program):
    """
    Find a program in $PATH

    If program contains a directory part it is checked as-is and $PATH
    is not consulted.  Otherwise every $PATH entry is tried in order.

    Returns the path of the first executable regular file found,
    otherwise None (also when $PATH is not set at all).
    """
    def is_exe(fpath):
        """Return True if fpath is an executable file, False otherwise"""
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    dirname, _ = os.path.split(program)
    if dirname:
        return program if is_exe(program) else None

    # An unset PATH must not blow up with KeyError: there is simply
    # nowhere to search, so the program is reported as not found.
    search_path = os.environ.get("PATH")
    if search_path is None:
        return None

    for directory in search_path.split(os.pathsep):
        # Some environments (notably Windows) quote PATH entries
        candidate = os.path.join(directory.strip('"'), program)
        if is_exe(candidate):
            return candidate

    return None
def all_features_dicts():
    """
    Split all known feature names (the keys of F_HASH) into two dicts:

    1st dict holds features that are not in any name-space:
        { "f1": 1, "f2": 1, ... "fN": 1 }

    2nd dict holds features in name-spaces, keyed by name-space:
        {
            "namespace1": { "f1": 1, "f2": 1, ... },
            "namespace2": { "f1": 1, "f2": 1, ... },
        }

    A name is considered name-spaced when it contains '^'; the part
    before the first '^' is the name-space, the rest is the feature.
    """
    plain = {}
    by_namespace = {}

    for name in F_HASH:
        ns, caret, feature = name.partition('^')
        if caret:
            by_namespace.setdefault(ns, {})[feature] = 1
        elif name != 'Constant':
            # Constant feature should never be added as a regular
            # feature. vw adds it by itself as needed.
            # TODO: multiclass uses separate Constant_<N> per class
            plain[name] = 1

    return plain, by_namespace
def all_features_example():
    """
    Build one vw input line (label 1) in which every known feature appears

    Features without a name-space come first after a bare '|', followed
    by one '|<namespace> ...' section per name-space.
    """
    # TODO: implement multi-class: needs per-class internal data-structs
    plain, by_namespace = all_features_dicts()

    plain_section = ' | ' + ' '.join(plain)
    ns_sections = [
        " |%s %s" % (ns, ' '.join(by_namespace[ns]))
        for ns in by_namespace
    ]

    example = '1' + plain_section + ' '.join(ns_sections) + '\n'
    d("all_features_example: %s" % example)
    return example
def process_audit_line(line):
    """
    Process an audit line coming from 'vw'
    Track min/max/hash-value/weight for each feature

    line is expected to be tab-separated; everything before the first
    tab is ignored.  Each remaining chunk is ':'-separated and read as
        name:hash:value:...:weight[@...]
    The results are stored in the F_HASH, F_WEIGHT, F_MIN and F_MAX
    module-level dicts, keyed by feature name.

    Chunks with an empty name or with fewer than 3 fields are skipped.
    Non-numeric hash/value/weight fields still raise ValueError.
    """
    # Strip the line terminator first: otherwise a line ending in a tab
    # (e.g. "\t\n") yields a bogus "\n" chunk which has no hash/value
    # fields and used to fail with IndexError.
    chunks = line.rstrip('\r\n').split("\t")[1:]

    for chunk in chunks:
        fields = chunk.split(':')
        fname = fields[0]
        if fname == '' or len(fields) < 3:
            # don't process 'empty' or truncated features
            continue

        fhash = int(fields[1])
        fval = float(fields[2])
        # The weight is the last field, possibly followed by '@<extra>'
        fweight = float(fields[-1].split('@')[0])

        F_WEIGHT[fname] = fweight
        F_HASH[fname] = fhash

        if fname in F_MIN:
            F_MIN[fname] = min(F_MIN[fname], fval)
            F_MAX[fname] = max(F_MAX[fname], fval)
        else:
            # feature seen for 1st time
            F_MIN[fname] = fval
            F_MAX[fname] = fval
def vw_audit(vw_cmd, our_input=None):
    """
    Generator for vw audit-lines
    (Each example is mapped to its audit-line)

    vw_cmd is a list of args to run vw with (vw command line)

    There are two modes of running:
        1) Normal: input provided directly to vw from command line
        2) 2nd pass: input provided by vw-varinfo as a string
           This mode is activated when our_input="some string..."

    Only lines starting with a tab are yielded; all other output of the
    subprocess (stdout and stderr are merged) is read and dropped.
    Once the output is exhausted the exit status is checked and a
    non-zero status is reported via fatal().
    """
    # Settings shared by both modes
    common = dict(
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        bufsize=1048576,
    )

    if our_input:
        # Input comes from our_input (string)
        # which is sent via stdin to the vw subprocess
        vw_proc = subprocess.Popen(vw_cmd, stdin=subprocess.PIPE, **common)
        # the pipe is binary, so the string has to be encoded first
        vw_proc.stdin.write(our_input.encode())
        vw_proc.stdin.close()
    else:
        # By default, vw reads from a training-set
        # which is provided on the command line
        vw_proc = subprocess.Popen(vw_cmd, close_fds=False, **common)

    example_no = 0
    # readline() returns b'' only at end of output
    for raw_line in iter(vw_proc.stdout.readline, b''):
        vw_line = raw_line.decode()
        # An audit line is recognized by a leading-tab
        if vw_line[0] == '\t':
            example_no += 1
            d(vw_line)
            yield vw_line
        # Q: anything we want to do with other lines?
        # A: for now no, we just read the next line from vw

    # End of input
    vw_proc.stdout.close()
    vw_proc.wait()
    status = vw_proc.returncode
    if status:
        # non-zero exit code, print the full command that
        # failed to help user reproduce/understand it
        fatal("vw subprocess failed (status=%s):\n\t%s\n"
              "(Run the command above to reproduce the failure)" %
              (status, ' '.join(vw_cmd)))
    else:
        # everything looks cool, support debugging anyway
        d("%s: %s examples, exit status: %s" %
          (vw_cmd, example_no, status))
def run_vw(vw_cmd, our_input=None):
    """
    Run vw and feed each of its audit lines to process_audit_line()

    vw_cmd and our_input are passed unchanged to vw_audit().
    """
    audit_lines = vw_audit(vw_cmd, our_input)
    for audit_line in audit_lines:
        process_audit_line(audit_line)
def is_vw_arg(arg):
    """
    Tell whether arg names a 'vw' executable

    True when arg equals the current VW value, or when its last path
    component is 'vw' optionally followed by version-like characters
    (-, _, ., digits) and an optional '.exe'.

    Side effect: in the second case the VW global is set to arg, so
    that the user's choice of executable is used from then on.
    """
    global VW

    if arg != VW:
        if not re.search(r'(?:^|[\\/])vw[-_.0-9]*(\.exe)?$', arg):
            return False
        VW = arg

    return True
def already_has_audit(args):
    """Tell whether auditing (--audit or its short form -a) is in args"""
    return any(flag in args for flag in ('-a', '--audit'))
def is_multiclass(args):
    """
    Look for any option in args that hints at a multiclass problem
    (The list of options checked may be incomplete)
    """
    # Not sure if --wap, --ect multi-class are actually right
    mc_options = ('--oaa', '--csoaa', '--ect', '--wap', '--sequence')
    return any(opt in args for opt in mc_options)
def model_arg(args):
    """
    Return the model arg if any

    Looks for vw's model-output option, '-f' or its long form
    '--final_regressor', and returns the argument following it.
    Returns None when neither option is present.
    Calls fatal() (which exits) when the option is the last arg,
    i.e. the model file name is missing.
    """
    for flag in ('-f', '--final_regressor'):
        try:
            flag_idx = args.index(flag)
        except ValueError:
            # not there
            continue
        try:
            return args[flag_idx + 1]
        except IndexError:
            fatal("Oops! %s without an arg - can't continue" % flag)
    return None
def get_vw_cmd(args):
    """
    Return the vw command we want to run

    args is the full argument vector (as in sys.argv); it is not
    modified, the command is built on a copy.

    This means stripping our own (vw-varinfo) name from the list
    and making sure:
        1) That we have 'vw' at the beginning
        2) That auditing (--audit) is requested
        3) That a model is saved (-f), using a temporary file when
           the user did not ask for one

    Side effects on module globals:
        Verbose        set when the 1st arg is '-v'
        VW             possibly changed by is_vw_arg()
        VWARGS         gets '--noconstant' when the user passed it
        ModelName      name of the model file vw will write
        CleanupModel   set when ModelName is our own temporary file

    Exits via usage() on too few args and via fatal() when the vw
    executable can't be found.
    """
    global ModelName, CleanupModel, Verbose

    if len(args) <= 1:
        usage('')

    # -- move ourselves (vw-varinfo arg) out of the way
    # (slicing also gives us a private copy of the caller's list)
    args = list(args[1:])

    # 1st arg can be '-v' for debugging this script
    if args and args[0] == '-v':
        Verbose = True
        args.pop(0)

    if not args:
        usage('Too few args: %s' % args)

    if not is_vw_arg(args[0]):
        args.insert(0, 'vw.exe' if os.name == 'nt' else 'vw')

    # Check for the executable before creating any temporary file,
    # so that bailing out here leaves nothing behind.
    vw_exe = args[0]
    if which(vw_exe) is None:
        fatal("Sorry: can't find %s (vowpal wabbit executable) in $PATH\n"
              "PATH=%s" % (vw_exe, os.environ.get("PATH", "")))

    if not already_has_audit(args):
        args.insert(1, '--audit')

    if '--noconstant' in args and '--noconstant' not in VWARGS:
        VWARGS.append('--noconstant')

    model = model_arg(args)
    if model:
        ModelName = model
    else:
        # mkstemp() creates the file atomically, unlike the race-prone
        # (and deprecated) mktemp(); vw simply overwrites it later.
        model_fd, ModelName = tempfile.mkstemp(suffix='.vwmodel')
        os.close(model_fd)
        args[1:1] = ['-f', ModelName]
        CleanupModel = True

    # TODO: skip leading options that are intended for vw-varinfo itself
    d("vw_cmd is: %s" % args)

    return args
def minmax(data):
    """
    Return a pair (min, max) of the values in data

    data may be any iterable (list, dict view, generator ...).
    For empty data (0, 0) is returned.
    """
    values = list(data)
    if not values:
        return 0, 0
    # Real extremes of the data: seeding both with 0 (as was done
    # before) reported 0 as the minimum of all-positive data and as
    # the maximum of all-negative data.
    return min(values), max(values)
def summarize():
    """
    Print the per-feature summary table to stdout

    One header line, then one line per feature in F_WEIGHT, ordered by
    ascending weight, with columns:
        name, hash-value, min value, max value, weight, relative score
    The relative score is the weight as a percentage of the largest
    absolute weight (0 when all weights are 0).
    """
    lowest, highest = minmax(F_WEIGHT.values())
    w_absmax = max(abs(lowest), abs(highest))

    # Print a header
    header = ('FeatureName', 'HashVal', 'MinVal', 'MaxVal', 'Weight',
              'RelScore')
    print("%-16s\t%10s\t%s\t%s\t%s\t%s" % header)

    # TODO: implement multi-class
    #       multi-class needs per-class internal data-structs

    # To reverse-order add: 'reverse=True' arg to 'sorted'
    for fname in sorted(F_WEIGHT, key=F_WEIGHT.get):
        fweight = float(F_WEIGHT[fname])
        if w_absmax > 0:
            relscore = 100.0 * (fweight / w_absmax)
        else:
            relscore = 100.0 * 0.0
        row = (fname, F_HASH[fname],
               float(F_MIN[fname]), float(F_MAX[fname]),
               fweight, relscore)
        print("%-16s\t%10s\t%.2f\t%.2f\t%.2f\t%7.2f" % row)
def pass1(vw_cmd):
    """1st pass: run 'vw' exactly as given in vw_cmd, reading its own input"""
    d("Starting PASS 1 ...")
    run_vw(vw_cmd, our_input=None)
def pass2():
    """
    2nd pass: load the stored model in test-only mode and audit a
    single example containing all features, which yields the final
    weight of every feature
    """
    base_cmd = [VW, '--quiet', '-t', '-a', '-i', ModelName]
    # Options carried over from the 1st pass (if any) go last
    vw_cmd = base_cmd + VWARGS if VWARGS else base_cmd

    all_features = all_features_example()

    d("Starting PASS 2 ...")
    run_vw(vw_cmd, all_features)
#
# -- main
#
def main():
    """
    Main func for vw-varinfo2: dataset feature information summary

    Builds the vw command from sys.argv, runs both passes and prints
    the summary.  Returns 0 on success.  Any unhandled exception is
    reported (with traceback) through fatal(), which exits with 1.
    A temporary model file created by us is removed in all cases.
    """
    global MultiClass, MCLabels

    try:
        try:
            vw_cmd = get_vw_cmd(sys.argv)

            if is_multiclass(vw_cmd):
                # multi-class needs per-class internal data-structs
                MultiClass = True
                MCLabels = []

            # Run 1st pass to collect data on all features:
            pass1(vw_cmd)

            # Run second pass:
            # with -i ModelName and single example w/ all-features present
            pass2()

            summarize()
        finally:
            # Runs on success, on exceptions and on sys.exit() alike,
            # so a failed pass no longer leaves the temp model behind.
            if CleanupModel and os.path.exists(ModelName):
                d("removing tempfile: %s" % ModelName)
                os.remove(ModelName)

    except Exception as estr:
        # catch-all to cover all unhandled exceptions
        fatal("%s\n%s" % (estr, traceback.format_exc()))

    return 0


if __name__ == "__main__":
    sys.exit(main())