This repository has been archived by the owner on Jul 22, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 17
/
text2bin.py
93 lines (74 loc) · 2.56 KB
/
text2bin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python
#
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Converts vectors from text to a binary format for quicker manipulation.
Usage:
text2bin.py -o <out> -v <vocab> vec1.txt [vec2.txt ...]
Options:
-o <filename>, --output <filename>
The name of the file into which the binary vectors are written.
-v <filename>, --vocab <filename>
The name of the file into which the vocabulary is written.
Description
This program merges one or more whitespace separated vector files into a single
binary vector file that can be used by downstream evaluation tools in this
directory ("wordsim.py" and "analogy").
If more than one vector file is specified, then the files must be aligned
row-wise (i.e., each line must correspond to the same embedding), and they must
have the same number of columns (i.e., be the same dimension).
"""
from __future__ import print_function
from itertools import izip
from getopt import GetoptError, getopt
import os
import struct
import sys
# Parse the command line. An unrecognised flag is reported on stderr and the
# script exits with status 2.
try:
    opts, args = getopt(sys.argv[1:], 'o:v:', ['output=', 'vocab='])
except GetoptError as e:
    print(e, file=sys.stderr)
    sys.exit(2)

# Output locations used when the corresponding flag is not supplied.
opt_output = 'vecs.bin'
opt_vocab = 'vocab.txt'

# A later occurrence of a flag overrides an earlier one.
for flag, value in opts:
    if flag in ('-o', '--output'):
        opt_output = value
    elif flag in ('-v', '--vocab'):
        opt_vocab = value
def go(fhs):
    """Merge row-aligned text vector files into one binary vector file.

    Each input row has the form ``<token> <float> <float> ...``.  Rows are read
    from all inputs in lockstep; the vectors found on corresponding rows are
    summed element-wise.  For every accepted row the token is appended to the
    vocabulary file (``opt_vocab``) and the summed vector is appended to the
    binary file (``opt_output``) as packed C floats in native byte order.

    Rows are skipped, rather than aborting the run, when:
      * any input has a blank line at that position, or
      * the inputs disagree on the token (a warning naming the row number is
        written to stderr).

    Args:
      fhs: a sequence of iterables yielding text lines (typically open files).
        Reading stops as soon as the shortest input is exhausted.

    Raises:
      ValueError: if a vector component cannot be parsed as a float.
      struct.error: if a row's dimension differs from the first accepted row.
    """
    # Iterate lazily on both Python 2 (izip) and Python 3 (built-in zip) so
    # large vector files are never loaded into memory at once.
    try:
        from itertools import izip as lazy_zip
    except ImportError:
        lazy_zip = zip

    fmt = None
    with open(opt_vocab, 'w') as vocab_out:
        # The vectors are raw bytes, so the file must be opened in binary mode.
        with open(opt_output, 'wb') as vecs_out:
            for line_num, lines in enumerate(lazy_zip(*fhs), 1):
                parts = [line.split() for line in lines]

                # A blank line in any input leaves nothing to read a token
                # from; skip the whole row.
                if not all(parts):
                    continue

                token = parts[0][0]
                if any(part[0] != token for part in parts[1:]):
                    # Misaligned inputs: keep going, but say which row was
                    # dropped so the problem is not silent.
                    sys.stderr.write(
                        'text2bin: skipping misaligned row %d\n' % line_num)
                    continue

                vocab_out.write(token + '\n')

                # Transpose so each tuple holds one component from every
                # input; the first tuple is the tokens and is discarded.
                columns = list(zip(*parts))[1:]
                vec = [sum(float(x) for x in xs) for xs in columns]

                # The first accepted row fixes the dimension for the output.
                if fmt is None:
                    fmt = struct.Struct('%df' % len(vec))

                vecs_out.write(fmt.pack(*vec))
# Merge the vector files named on the command line, or read a single vector
# file from stdin when no filenames were given.
if args:
    fhs = []
    try:
        for filename in args:
            fhs.append(open(filename))
        go(fhs)
    finally:
        # Close every file that was opened, even if a later open() or the
        # conversion itself failed.
        for fh in fhs:
            fh.close()
else:
    go([sys.stdin])