-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathhownet_corpus_data_picker.py
37 lines (36 loc) · 1.37 KB
/
hownet_corpus_data_picker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import sys;
#reload(sys)
#sys.setdefaultencoding="utf-8"
def select_embedding_lines(hownet_lines, embedding_lines):
    """Return the lines of an embedding file restricted to HowNet words.

    Args:
        hownet_lines: Iterable of lines from the HowNet corpus. Only every
            second line, starting with the first, is looked at; after
            stripping whitespace it is used as the word to look up.
            (Presumably the line in between carries that word's HowNet
            annotation -- it is never read here.)
        embedding_lines: Iterable of lines from the embedding file. The
            second whitespace-separated field of the first line is read as
            the vector dimension; on every following line the first field
            is the word and the whole line is kept as its vector record.

    Returns:
        A list of newline-terminated strings: a ``"<count> <dim>"`` header,
        where ``count`` is the number of selected records, followed by the
        selected embedding lines in the order their words appear in
        ``hownet_lines``.

    Raises:
        ValueError: If the embedding input is empty, its header has fewer
            than two fields, or the dimension field is not an integer.
    """
    remaining = iter(embedding_lines)
    header = next(remaining, "").split()
    if len(header) < 2:
        raise ValueError(
            "embedding file must start with a '<count> <dim>' header line"
        )
    dim_size = int(header[1])

    # Word -> full embedding line. If a word occurs more than once, the
    # last occurrence wins.
    vector_line = {}
    for raw in remaining:
        fields = raw.split()
        if not fields:
            # Blank lines carry no word; skipping them avoids an IndexError.
            continue
        vector_line[fields[0]] = raw

    selected = []
    for position, raw in enumerate(hownet_lines):
        if position % 2:
            # Words are only taken from lines 0, 2, 4, ...
            continue
        record = vector_line.get(raw.strip())
        if record is None:
            continue
        # The last line of a file may lack its newline; add it so the
        # record is not glued to the one written after it.
        selected.append(record if record.endswith("\n") else record + "\n")

    return [str(len(selected)) + " " + str(dim_size) + "\n"] + selected


def main(argv=None):
    """Command-line entry point.

    Expects ``argv[1:4]`` to be the HowNet file, the embedding file and the
    target file to write, in that order. All files are read and written as
    UTF-8.

    Returns:
        The process exit status: 0 on success, 1 when too few arguments
        were given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        print('no enough parameter', file=sys.stderr)
        print(
            'usage: ' + (argv[0] if argv else 'hownet_corpus_data_picker.py')
            + ' <hownet_file> <embedding_file> <target_file>',
            file=sys.stderr,
        )
        return 1

    hownet_filename, embedding_filename, target_filename = argv[1:4]

    # Read and validate both inputs before touching the target, so a bad
    # input never truncates an existing output file.
    with open(hownet_filename, 'r', encoding='utf-8') as hownet, \
            open(embedding_filename, 'r', encoding='utf-8') as embedding:
        output_lines = select_embedding_lines(hownet, embedding)

    with open(target_filename, 'w', encoding='utf-8') as target:
        target.writelines(output_lines)
    return 0


if __name__ == "__main__":
    sys.exit(main())