-
Notifications
You must be signed in to change notification settings - Fork 1
/
keyed2indexed.py
31 lines (25 loc) · 1.26 KB
/
keyed2indexed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import wisse
from gensim.models.keyedvectors import KeyedVectors as vDB
import argparse
import logging
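# Command-line arguments (parsed with argparse below):
#   --input:  Input embeddings model (w2v format); required
#   --output: Output directory for indexed format (default: 'output_indexed')
#   --txt:    Read the input as text word2vec format (default: binary)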
# Convert a word2vec-format embedding model into the indexed on-disk format
# produced by `wisse.keyed2indexed`.

# Timestamped INFO-level logging so the progress messages below are visible.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

load_vectors = vDB.load_word2vec_format

parser = argparse.ArgumentParser()
parser.add_argument("--input", help="Input embeddings model (w2v format)",
                    required=True)
parser.add_argument("--output", help="Output direcory for indexed format",
                    default='output_indexed')
# `--txt` is a store_false flag saved under the name `binary`: the value is
# True (binary input) unless `--txt` is given, in which case it becomes False.
# Without `dest="binary"` argparse would store it as `args.txt`, and the
# `args.binary` lookup below would raise AttributeError on every run.
parser.add_argument("--txt", dest="binary",
                    help="Toggles text word2vec format input format "
                         "(default: binary)",
                    action='store_false')
args = parser.parse_args()

binary = args.binary
# The model is decoded as latin-1, as in the original script.
embedding = load_vectors(args.input, binary=binary, encoding="latin-1")

logging.info("Indexing embeddings, this will take a while...\n")
wisse.keyed2indexed(embedding, args.output)
# Lazy %-style argument: logging formats the same message as before.
logging.info("Embeddings indexed, please verify the contents of the output "
             "directory:\n %s\n", args.output)