forked from ZhenYangIACAS/unsupervised-NMT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vocab.py
45 lines (39 loc) · 1.33 KB
/
vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# -*- coding: utf-8 -*-
#/usr/bin/python2
"""
June 2017 by kyubyong park.
kbpark.linguist@gmail.com.
https://www.github.com/kyubyong/transformer
Modified by Chunqi Wang in July 2017.
"""
from __future__ import print_function
import codecs
import re as regex
import yaml
from argparse import ArgumentParser
from collections import Counter
from utils import AttrDict
def make_vocab(fpath, fname):
    """Constructs vocabulary.

    Counts the whitespace-separated tokens of `fpath` and writes one
    `word<TAB>count` entry per line to `fname`, most frequent word first.
    Four special tokens (<PAD>, <UNK>, <S>, </S>) are always written ahead
    of the corpus words, each with the fixed count 1000000000.

    Args:
      fpath: A string. Input file path. Read as UTF-8.
      fname: A string. Output file name. Written as UTF-8; an existing file
        is overwritten.

    Writes vocabulary line by line to `fname`.
    """
    word2cnt = Counter()
    # Read the corpus inside a `with` block so the handle is closed as soon
    # as counting is done, instead of being left to the garbage collector.
    with codecs.open(fpath, 'r', 'utf-8') as fin:
        for line in fin:
            # Counter.update tallies an iterable of tokens directly; no
            # intermediate per-line Counter is needed.
            word2cnt.update(line.split())

    with codecs.open(fname, 'w', 'utf-8') as fout:
        # Special tokens go first, in this exact order.
        # NOTE(review): the large count presumably keeps them ahead of any
        # real word when the file is re-sorted or thresholded by count --
        # confirm against the code that reads the vocabulary.
        for token in (u"<PAD>", u"<UNK>", u"<S>", u"</S>"):
            fout.write(u"{}\t1000000000\n".format(token))
        # most_common() with no argument returns every entry, ordered from
        # the most to the least frequent.
        for word, cnt in word2cnt.most_common():
            fout.write(u"{}\t{}\n".format(word, cnt))
if __name__ == '__main__':
    # Command-line entry point: builds the source and target vocabulary
    # files from the training corpora named in a YAML config file.
    parser = ArgumentParser()
    # The config path is mandatory; without it there is nothing to read.
    parser.add_argument('-c', '--config', dest='config', required=True,
                        help='Path to the YAML configuration file.')
    args = parser.parse_args()

    # Read config.
    # yaml.safe_load is used instead of yaml.load: calling yaml.load without
    # an explicit Loader can construct arbitrary Python objects and is
    # rejected by recent PyYAML releases.
    # NOTE(review): if any config relies on Python-specific YAML tags, switch
    # to yaml.load(config_file, Loader=...) with a deliberately chosen Loader.
    with open(args.config) as config_file:
        config = AttrDict(yaml.safe_load(config_file))

    # The config must provide train.src_path / train.dst_path (input corpora)
    # and src_vocab / dst_vocab (output vocabulary files).
    make_vocab(config.train.src_path, config.src_vocab)
    make_vocab(config.train.dst_path, config.dst_vocab)
    print("Done")