-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathencode.py
executable file
·93 lines (74 loc) · 2.78 KB
/
encode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3
import argparse
import joblib
import json
import logging
from collections import defaultdict
from common import load_json, serialize
import encoders
import settings
# NOTE(review): META_KEY is not referenced in the visible part of this
# module; presumably it names a metadata entry used elsewhere - confirm.
META_KEY = 'META'

# Shared logger for this script.
LOGGER = logging.getLogger('cve-score')

# Registry mapping the "encoder" name found in a config entry to the class
# in the `encoders` module that implements it (see `get_encoder`).
ENCODERS = {
    'dense': encoders.DenseEncoder,
    'sparse': encoders.SparseEncoder,
    'numeric': encoders.NumericEncoder,
    'embedding': encoders.EmbeddingEncoder,
}
def get_encoder(config, vocabulary):
    '''Factory method to instantiate a class in the `encoders` module,
    tied to a particular key.

    Arguments:
        config: [dict] specifying the "key," "encoder" type, and additional
            constructor arguments. The whole dict is forwarded to the
            constructor as keyword arguments.
        vocabulary: [dict] mapping keys to token-frequency mappings; the
            entry for the config's key (or `None` if absent) is passed to
            the constructor as `vocabulary`.

    Returns a pair (key, encoder) on success, consisting of the config's
    key and an instance of the selected `ENCODERS` class. Returns `None`
    on failure: no "encoder" entry, an unrecognized encoder name, a missing
    "key" entry, or constructor arguments that are rejected.
    '''
    encoder_name = config.get('encoder')
    if encoder_name is None:
        # Entries without an encoder are skipped silently.
        return None

    if encoder_name not in ENCODERS:
        LOGGER.warning('unrecognized encoder: %s', encoder_name)
        return None

    # A missing key is a configuration failure, not a crash: honor the
    # documented "None on failure" contract instead of raising KeyError.
    if 'key' not in config:
        LOGGER.warning('missing "key" for encoder "%s"', encoder_name)
        return None

    key = config['key']
    constructor = ENCODERS[encoder_name]
    try:
        # Building kwargs can itself raise TypeError (e.g. the config
        # already carries a "vocabulary" entry), so it belongs inside the
        # guarded region together with the constructor call.
        kwargs = dict(vocabulary=vocabulary.get(key), **config)
        return (key, constructor(**kwargs))
    except (ValueError, TypeError) as ex:
        LOGGER.warning('failure to instantiate "%s": %s', encoder_name, ex)
        return None
def encode(encoders, records):
    '''Converts a stream of JSON records into a dictionary of encoded columns.

    Arguments:
        encoders: list of (key, encoder) pairs, where the right hand value
            is an instance from the `encoders` module. Each encoder is
            called on a record's value for its key, and its `transform`
            method is then applied to the list of collected results.
        records: iterable of JSON-encoded records, one per item (e.g. an
            open file of line-separated JSON). Blank or whitespace-only
            items are skipped.

    Returns a dictionary mapping "keys" from encoders list to the value
    returned by the matching encoder's `transform` (numpy objects for the
    project's encoders, presumably - confirm against `encoders`).

    Raises `json.JSONDecodeError` for a malformed record and `KeyError`
    when a record lacks one of the encoder keys.
    '''
    pairs = list(encoders)

    # One accumulator per encoder (not per key): if two encoders share a
    # key, their outputs must not be interleaved into a single list.
    columns = [[] for _ in pairs]

    for line in records:
        # Tolerate blank separator lines instead of failing in json.loads.
        if not line.strip():
            continue
        record = json.loads(line)
        for column, (key, encoder) in zip(columns, pairs):
            column.append(encoder(record[key]))

    # With duplicate keys the last encoder wins, as a dict can hold only
    # one value per key.
    return {
        key: encoder.transform(column)
        for column, (key, encoder) in zip(columns, pairs)
    }
if __name__ == '__main__':
    # Command-line entry point: build the encoders described by the config,
    # run them over the input records, and serialize the result.
    parser = argparse.ArgumentParser()
    parser.add_argument('config', help='JSON processing config')
    parser.add_argument('vocabulary',
                        help='JSON file of token-frequency mappings')
    parser.add_argument('infile', help='raw JSON records, one per line')
    parser.add_argument('outfile', help='pickled dict of numpy arrays')
    args = parser.parse_args()

    settings.configure_logging()
    config = load_json(args.config)
    vocabulary = load_json(args.vocabulary)

    # Deliberately not named `encoders`: that would rebind the imported
    # `encoders` module at module level. Entries for which `get_encoder`
    # returned None (skipped or failed) are dropped.
    candidates = (get_encoder(item, vocabulary) for item in config)
    active_encoders = [pair for pair in candidates if pair is not None]

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(args.infile, encoding='utf-8') as fh:
        LOGGER.info('reading %s', args.infile)
        data = encode(active_encoders, fh)
    serialize(args.outfile, data)