# try1.py (forked from HGUISEL/TIBigdataMiddleware)
# -*- coding: utf-8 -*-
import json
import pickle

import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

from common import prs

# Load and preprocess ndoc documents via the shared parsing module.
ndoc = 1000
prsResult = prs.readyData(ndoc, True)

# readyData returns one sequence per field; transpose so each document is a row.
data = pd.DataFrame(list(prsResult), index=["id", "content", "token", "contents"]).T

# Restore the trained topic classifier saved by the training script.
model = keras.models.load_model('tib_topic_model')
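# For context, a plausible sketch of how 'tib_topic_model' may have been built
# at training time (an assumption based on the LSTM naming used for the output
# file below; the real architecture lives in the training code):
#
#     model = keras.Sequential([
#         keras.layers.Embedding(input_dim=5000, output_dim=100, input_length=500),
#         keras.layers.LSTM(100),
#         keras.layers.Dense(6, activation='softmax'),  # one unit per topic
#     ])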
# Join each document's token list back into one whitespace-separated string.
for i in range(data.shape[0]):
    data.loc[i, "token"] = " ".join(data["token"][i])
# The dummy CSV's column names (minus its index column) give the topic labels.
topicDummy = pd.read_csv('./topicDummy.csv')
topicDummy = topicDummy.drop(topicDummy.columns[0], axis=1)
print(topicDummy.columns)
topicList = topicDummy.columns

data["topic"] = None
MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 500

# Reuse the tokenizer fitted at training time; fitting a fresh one here
# (e.g. Tokenizer(num_words=MAX_NB_WORDS, filters=" ")) would give the
# model a different vocabulary than the one it was trained on.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
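# For reference, a minimal sketch of how such a pickle is typically produced
# at training time (assumes a hypothetical `train_texts` list; not part of
# this script):
#
#     from tensorflow.keras.preprocessing.text import Tokenizer
#     tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=" ")
#     tokenizer.fit_on_texts(train_texts)
#     with open('tokenizer.pickle', 'wb') as handle:
#         pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)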
# Classify each document: encode tokens as integer sequences, pad to a fixed
# length, and pick the highest-scoring topic. Label order follows the dummy
# columns (e.g. pol, innt, soc, cul, eco, spo).
for i, cont in enumerate(data["token"]):
    seq = tokenizer.texts_to_sequences([cont])
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
    pred = model.predict(padded)
    data.loc[i, "topic"] = topicList[np.argmax(pred)]
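# A faster alternative (a sketch): batch all documents into a single
# predict() call instead of one call per document.
#
#     seqs = tokenizer.texts_to_sequences(data["token"].tolist())
#     padded = pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
#     preds = model.predict(padded)
#     data["topic"] = [topicList[j] for j in np.argmax(preds, axis=1)]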
# Sanity check: show a few classified documents and a count per topic.
for top in topicList:
    print(data[data["topic"] == top][["token", "topic"]].head(3), "\n")

for topic in topicList:
    sumVal = (data["topic"] == topic).sum()
    print(topic, " count : ", sumVal)
# Downstream consumers expect the token column to be named "words".
data = data.rename(columns={"token": "words"})
# Group documents by predicted topic into a list of {topic, doc} records.
ctgResult = []
for topic in topicList:
    ctg = data[data["topic"] == topic]
    # to_dict('records') keeps rows as nested objects; to_json(orient="records")
    # would produce a JSON string instead.
    doc = ctg.to_dict('records')
    catObj = {
        "topic": topic,
        "doc": doc
    }
    ctgResult.append(catObj)
# Write the grouped result as UTF-8 JSON, keeping non-ASCII text readable.
with open("lstm_result_with_" + str(ndoc) + ".json", 'w', encoding='utf8') as f:
    json.dump(ctgResult, f, ensure_ascii=False)
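# A quick way to inspect the output (a sketch; the file name depends on ndoc):
#
#     with open("lstm_result_with_1000.json", encoding='utf8') as f:
#         result = json.load(f)
#     print(result[0]["topic"], len(result[0]["doc"]))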