-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpreprocessing.py
59 lines (52 loc) · 1.53 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# import tensorflow as tf
import os
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pickle
# Turn each Chinese character into an integer index token and build a dictionary
# with open("poem.txt", "r") as f:
# print(f.readline())
# dict = pickle.load(open("dict.pkl", 'rb'))
# print(max(dict.values()))
# poem = [3959, 1437 ,6313 ,1475, 1218 ,410, 5503 ,1017, 3683 ,1308, 4358, 2747, 536, 1278 ,5610, 3943, 1243 ,3679 ,6317 ,6525]
# # poem = [3951, 5009, 1101, 1619, 2238, 1018, 984,574,5816,2423,6186,6337,2783,2163,6472,1808,3165 ,113 ,6519 ,1437]
# word2id = np.load("dict.pkl")
# id2word = {v:k for k, v in word2id.items()}
# for idx in poem:
# print(id2word[idx])
# 5000 Tang poems
corpus = "qts_tab.txt"

# Read the corpus and keep only the lines that are exactly 9 characters long
# after surrounding whitespace is stripped; every other line is ignored.
# NOTE(review): 9 presumably means five characters joined by four tabs --
# confirm against the corpus file.
with open(corpus, 'r', encoding='utf-8') as src:
    stripped_lines = (raw.strip() for raw in src)
    poems = [text for text in stripped_lines if len(text) == 9]
print(len(poems))

# Cap the working set at the first 290000 retained lines.
poems = poems[:290000]
# Layout of the training file: each output row is one poem made of
# LINES_PER_POEM lines with CHARS_PER_LINE token ids per line.
CHARS_PER_LINE = 5
LINES_PER_POEM = 4
MAX_LINES = 20000

# The token pattern accepts tokens that are a single word character long,
# so every individual character in a line becomes its own vocabulary entry.
counter = CountVectorizer(token_pattern='(?u)\\b\\w+\\b')
x = counter.fit_transform(poems)
print()
print()

# Every non-zero entry of the count matrix is a (line number, vocabulary id)
# pair; column 0 is the line, column 1 is the token id.
xx = np.argwhere(x)
line_ids = xx[:, 0]
idx = xx[:, 1]

# A count matrix holds one entry per *distinct* token, so a line with a
# repeated character contributes fewer than CHARS_PER_LINE entries. Reshaping
# the flat id list blindly would then shift every following line (or raise),
# so only lines with exactly CHARS_PER_LINE entries are kept.
tokens_per_line = np.bincount(line_ids, minlength=x.shape[0])
skipped = int(np.count_nonzero(tokens_per_line != CHARS_PER_LINE))
if skipped:
    print("skipped %d lines without exactly %d distinct tokens"
          % (skipped, CHARS_PER_LINE))
complete = tokens_per_line[line_ids] == CHARS_PER_LINE
idx = idx[complete].reshape(-1, CHARS_PER_LINE)
print(idx.shape)

# NOTE(review): the ids within a line appear in the order the sparse matrix
# stores them, which is not necessarily the reading order of the characters
# in the text -- confirm that downstream code does not depend on it.
idx = idx[:MAX_LINES, :]

# Drop a trailing partial poem so the final reshape cannot fail when fewer
# than MAX_LINES lines are available.
idx = idx[:len(idx) - len(idx) % LINES_PER_POEM]

# Reverse the ids inside each line, then pack consecutive lines into poems.
flip = np.fliplr(idx)
ts = flip.reshape(-1, CHARS_PER_LINE * LINES_PER_POEM)
print(len(ts))
np.savetxt("train.txt", ts, "%d")
# np.save("dict.npy", counter.vocabulary_)
print(type(counter.vocabulary_))

# Persist the fitted vocabulary mapping. The context manager closes the file
# even if pickling raises, which the manual open()/close() pair did not.
with open("dict.pkl", "wb") as dict_file:
    pickle.dump(counter.vocabulary_, dict_file)
# dict format: {'word': 12} -- each token maps to its integer id
# np.save("w2i.npy", dict)