-
Notifications
You must be signed in to change notification settings - Fork 8
/
preprocess.py
executable file
·91 lines (72 loc) · 2.78 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python3
import argparse
import json
import logging
from collections import defaultdict, Counter
from common import load_json, dump_json
import preprocessors
import settings
# Logger used for this script's warnings and progress messages.
LOGGER = logging.getLogger('cve-score')

# Registry mapping the "preprocessor" name found in a config entry to the
# class in the `preprocessors` module that implements it.  Names missing
# from this table are rejected by `get_preprocessor`.
PREPROCESSORS = {
    'flatten': preprocessors.FlatMap,
    'tokenize': preprocessors.Tokenizer,
    'binarize': preprocessors.Binarizer,
    'identity': preprocessors.PassThrough,
}

# Per-key token statistics gathered while records are preprocessed:
# _TOKEN_COUNTS[key][token] is the number of records whose list-valued
# output for `key` contained `token` (each record counts a token once).
# Written out as the vocabulary file when `--vocabulary` is given.
_TOKEN_COUNTS = defaultdict(Counter)
def get_preprocessor(config):
    '''Factory method to instantiate a class in the `preprocessors` module,
    tied to a particular key.

    Arguments:
        config: [dict] specifying the "key," "preprocessor" type, and
            additional constructor arguments.  The whole dict (including
            the "key" and "preprocessor" entries) is passed to the
            constructor as keyword arguments.

    Returns a pair (key, preprocessor) on success, where `key` is the
    config's "key" value and `preprocessor` is the instance built by the
    registered class.  Returns `None` on failure (unknown preprocessor
    name, missing "key", or a constructor that raises); every failure is
    logged as a warning.
    '''
    preproc_name = config.get('preprocessor')
    if preproc_name not in PREPROCESSORS:
        LOGGER.warning('unrecognized preprocessor: %s', preproc_name)
        return None

    # A config entry without "key" is a failure like any other: report it
    # and return None instead of letting a KeyError escape to the caller.
    if 'key' not in config:
        LOGGER.warning('missing "key" in config for preprocessor "%s"',
                       preproc_name)
        return None
    key = config['key']

    constructor = PREPROCESSORS[preproc_name]
    try:
        instance = constructor(**config)
    except Exception as ex:
        # Deliberately broad: whatever the constructor rejects, the entry
        # is skipped (with a warning) rather than aborting the whole run.
        LOGGER.warning('failure to instantiate "%s": %s', preproc_name, ex)
        return None
    return (key, instance)
def preprocess(preprocessors, records):
    '''Iterator transforming a stream of raw JSON records into a stream
    of preprocessed records.

    Arguments:
        preprocessors: list of (key, callable) pairs.  Each callable is
            applied to `record.get(key)`, so it receives `None` when the
            record has no such key.
        records: iterable of strings, one JSON object per line (for
            example an open text file).  Blank or whitespace-only lines
            are skipped.

    Returns an iterator of dictionaries, one per input record, mapping
    each key to its preprocessed value.

    Side effect: whenever a preprocessed value is a list, its distinct
    elements are added to the module-level `_TOKEN_COUNTS[key]` counter,
    so each token is counted at most once per record.
    '''
    for line in records:
        # Tolerate empty lines (e.g. stray newlines in the input) instead
        # of failing the whole run with a JSON decode error.
        if not line.strip():
            continue
        record = json.loads(line)
        result = {}
        for key, function in preprocessors:
            value = function(record.get(key))
            result[key] = value
            if isinstance(value, list):
                # set() de-duplicates within the record before counting.
                _TOKEN_COUNTS[key].update(set(value))
        yield result
def main():
    '''Command-line entry point.

    Loads the JSON processing config, builds one preprocessor per config
    entry (entries that fail to build are dropped), streams the input
    records through them, and writes one JSON object per line to the
    output file.  When `--vocabulary` is given, the token counts
    accumulated during preprocessing are dumped to that file afterwards.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('config', help='JSON processing config')
    parser.add_argument('infile', help='raw JSON records, one per line')
    parser.add_argument('outfile')
    parser.add_argument('--vocabulary',
                        help='target JSON file of token-count mappings')
    parser.add_argument('--logging', help='JSON logging config')
    args = parser.parse_args()

    settings.configure_logging(args.logging)
    config = load_json(args.config)

    # Named `pipeline` (not `preprocessors`) so the imported `preprocessors`
    # module is not shadowed.  Failed entries come back as None and are
    # filtered out here.
    pipeline = [pair for pair in (get_preprocessor(item) for item in config)
                if pair]

    # JSON text is UTF-8; be explicit rather than relying on the locale.
    with open(args.infile, encoding='utf-8') as f_in, \
            open(args.outfile, 'w', encoding='utf-8') as f_out:
        LOGGER.info('writing %s', args.outfile)
        for record in preprocess(pipeline, f_in):
            f_out.write('%s\n' % json.dumps(record))

    # Token counts are only complete once every record has been consumed.
    if args.vocabulary:
        dump_json(args.vocabulary, _TOKEN_COUNTS)


if __name__ == '__main__':
    main()