"""
集合了清洗数据的函数,例如:简体繁体转化、文本向量化等
"""
#encoding:utf-8
import os
import pickle
import math
import jieba
import jieba.posseg as pseg
import numpy as np
try:
    from .clean_data.clean_html import cleanHtml
except ImportError:
    from clean_data.clean_html import cleanHtml
try:
    from .clean_data.langconv import *
except ImportError:
    from clean_data.langconv import *
dictPath = os.path.join('data','emdict','userdict')  # path joined this way for Linux compatibility
jieba.load_userdict(dictPath)  # load the custom user dictionary
stopwordsPath = os.path.join('data','emdict','stopword.plk')  # stop words
negwordsPath = os.path.join('data','emdict','negword.plk')    # negative-sentiment words
poswordsPath = os.path.join('data','emdict','posword.plk')    # positive-sentiment words
documentPath = os.path.join('data','trainset')  # directory of training samples
stopList = []
emotionList = []
posList = []
negList = []
wordsList = []  # bag of words built from all training samples (for the dictionary-free vectorization mode)
docList = []    # 2-D list holding the words of every document (needed for TF-IDF)
# Convert Traditional Chinese to Simplified Chinese
def cht_to_chs(line):
    line = Converter('zh-hans').convert(line)
    return line
# Convert Simplified Chinese to Traditional Chinese
def chs_to_cht(line):
    line = Converter('zh-hant').convert(line)
    return line
def clearNews(news,mode=False):
    '''
    Normalize a news article into a single clean format.
    :param news: raw text, possibly Traditional Chinese and/or containing HTML markup
    :param mode: False (default) converts Traditional -> Simplified; True converts Simplified -> Traditional
    :return: the cleaned, normalized news text
    '''
    if not mode:
        return cleanHtml(cht_to_chs(news))
    else:
        return cleanHtml(chs_to_cht(news))
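# Illustrative call (hypothetical input): clearNews('<p>繁體新聞</p>') first converts the
# Traditional characters to Simplified and then strips the HTML markup via cleanHtml,
# so the result should be the plain Simplified string '繁体新闻'.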
# Load the stop-word list
def loadStopwords(path = stopwordsPath):
    global stopList
    with open(path,'rb') as f:
        stopList = pickle.load(f)
# Load the sentiment-word lists
def loadEmotionwords(*paths):
    global posList,emotionList,negList
    if not len(paths):
        with open(negwordsPath,'rb') as f:
            negList = pickle.load(f)
        with open(poswordsPath,'rb') as f:
            posList = pickle.load(f)
        emotionList = posList+negList
    else:
        for path in paths:
            with open(path,'rb') as f:
                emotionList.extend(pickle.load(f))
# For the dictionary-free vectorization mode.
# Call this once, then use the global variable wordsList.
def loadWords(stopList,path=documentPath):
    global wordsList
    wordsSet = set()
    for file in os.listdir(path):
        news = None
        with open(os.path.join(path,file),'r',encoding='utf-8',errors='ignore') as f:
            news = f.read()
        noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')]  # nouns in the text
        news = set(jieba.cut(news))
        news = {word for word in news if (word not in stopList) and (word not in noun)}  # drop stop words and nouns
        wordsSet = news | wordsSet  # union with the accumulated vocabulary
    # A sorted list is used so the resulting word order is stable across runs
    wordsList = sorted(wordsSet)
    return None
# For TF-IDF.
# Read the words of every training document, turning the global variable into a 2-D list.
def loadDocument(stopList,path=documentPath):
    global docList
    docList = []
    for file in os.listdir(path):
        news = None
        with open(os.path.join(path,file),'r',encoding='utf-8') as f:
            news = f.read()
        noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')]
        news = list(jieba.cut(news))
        news = [word for word in news if (word not in stopList) and (word not in noun)]  # drop stop words and nouns
        docList.append(news)
    return None
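# The four load* helpers above are typically called once at start-up (as in the __main__
# block below); they fill the module-level globals stopList, posList/negList/emotionList,
# wordsList and docList that words2Vec relies on.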
def words2Vec(news,emotionList,stopList,posList,negList,mode=0):
    """
    Translate a news article into a word vector.
    :param news: news text
    :param emotionList: list of sentiment words
    :param stopList: list of stop words
    :param posList: list of positive-sentiment words
    :param negList: list of negative-sentiment words
    :param mode: int in [0,5); selects the text-vectorization method
    :return: a plain list (convenient for later operations such as numpy.array())
    """
    # Argument type check
    assert isinstance(stopList,list) and isinstance(emotionList,list),"wrong argument type in words2Vec at operate_data.py"
    news = clearNews(news)
    noun = [word for word,flag in pseg.lcut(news) if flag.startswith('n')]  # nouns in the text
    # Drop stop words and nouns
    newswords = list(jieba.cut(news))
    newswords = [word for word in newswords if (word not in stopList) and (word not in noun)]
    wordsVec = []
    # one-hot
    # time: O(n)
    if mode==0:
        for word in emotionList:
            if word in newswords: wordsVec.append(1)
            else: wordsVec.append(0)
    # term frequency
    # time: O(n)
    elif mode==1:
        for word in emotionList:
            wordsVec.append(newswords.count(word))
    # two-element vector (positive count, negative count)
    # time: O(2*n)
    elif mode==2:
        negTimes = 0; posTimes = 0
        for word in posList:
            posTimes += newswords.count(word)
        for word in negList:
            negTimes += newswords.count(word)
        wordsVec.append(posTimes); wordsVec.append(negTimes)
    # tf-idf
    # time: O(2*n*n)
    elif mode==3:
        global docList  # refer to the global variable filled by loadDocument
        docSum = len(docList)  # the first dimension is the number of documents
        for word in emotionList:
            TF = 0
            IDF = 0
            times = 0
            for doc in docList:
                if word in doc: times += 1
            IDF = math.log10(docSum/(times+1))
            times = 0
            for doc in docList:
                times += doc.count(word)
            TF = newswords.count(word)/(times+1)
            wordsVec.append(TF*IDF)
    # out-of-dict (full training vocabulary)
    # time: O(2*n)
    elif mode==4:
        global wordsList
        for word in wordsList:
            wordsVec.append(newswords.count(word))
    return wordsVec
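# Illustrative usage sketch (hypothetical variable names; assumes the load* helpers above
# have already populated the global word lists):
#   loadStopwords(); loadEmotionwords(); loadWords(stopList); loadDocument(stopList)
#   vec = words2Vec(raw_news_text, emotionList, stopList, posList, negList, mode=1)
# mode=0: one-hot over emotionList, mode=1: term frequency, mode=2: (pos count, neg count),
# mode=3: TF-IDF over emotionList, mode=4: counts over the full training vocabulary wordsList.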
def dataNormal(vecArr):
    '''
    Min-max normalization of the data.
    :param vecArr: word vector as a numpy array
    :return: the normalized word vector, reducing the impact of differing scales.
    '''
    return (vecArr-vecArr.min())/(vecArr.max()-vecArr.min())
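# Min-max scaling: x' = (x - min(x)) / (max(x) - min(x)); e.g. np.array([1., 2., 4.])
# becomes [0., 1/3, 1.]. Note this assumes vecArr.max() != vecArr.min(); a constant
# vector would divide by zero.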
def randomData(xData,yData,w=0.25,logFile=None):
    """
    Randomly split the data into a training set and a test set.
    :param xData: 2-D ndarray with one row per sample.
    :param yData: 1-D ndarray of labels, one per sample.
    :param w: fraction of the data held out as the test set
    :param logFile: optional list of tags (e.g. file names), one per sample
    :return: trainX,trainY,testX,testY[,logTrain,logTest]
    """
    np.random.seed(0)  # fix the seed so results are comparable across runs
    if logFile:
        assert len(logFile)==len(xData)==len(yData),'dimension mismatch at operate_data.py'
    else:
        assert len(xData)==len(yData),'dimension mismatch at operate_data.py'
    length = len(xData)
    indices = np.random.permutation(length)  # random permutation of the indices [0, length)
    trainX = xData[indices[:(-1)*int(w*length)]]  # select the elements at those indices
    trainY = yData[indices[:(-1)*int(w*length)]]
    testX = xData[indices[(-1)*int(w*length):]]
    testY = yData[indices[(-1)*int(w*length):]]
    if logFile:
        logTrain = [logFile[i] for i in indices[:(-1)*int(w*length)]]
        logTest = [logFile[i] for i in indices[(-1)*int(w*length):]]
        return trainX,trainY,testX,testY,logTrain,logTest
    return trainX,trainY,testX,testY
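# Illustrative split (hypothetical toy arrays): with 8 samples and w=0.25, the last
# int(0.25*8)=2 shuffled indices form the test set and the remaining 6 the training set:
#   X = np.arange(16).reshape(8, 2); Y = np.array([1, -1, 0, 1, -1, 0, 1, -1])
#   trainX, trainY, testX, testY = randomData(X, Y, w=0.25)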
def twoTag(x_arr,y_arr):
    """
    For the binary-classification setting.
    :param x_arr: 2-D ndarray with one row per sample.
    :param y_arr: 1-D ndarray of labels.
    :return: new samples with all neutral samples (label 0) removed
    """
    new_index = (y_arr != 0)
    new_x = x_arr[new_index, :]  # keep only the non-neutral samples
    new_y = y_arr[new_index]
    return new_x,new_y
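# E.g. (hypothetical labels) with y_arr = np.array([1, 0, -1]) the boolean mask is
# [True, False, True], so only rows 0 and 2 of x_arr survive the filtering.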
if __name__=='__main__':
    modes = 5  # five words2Vec vectorization methods in total
    loadStopwords()
    loadEmotionwords()
    loadWords(stopList)
    loadDocument(stopList)
    resultX = []
    resultY = []
    logfile = []  # kept for later debugging
    for doc in os.listdir(documentPath):
        if doc[:3] in ('pos','neg','neu'):
            logfile.append(doc)
    # logfile stores every file id together with its tag;
    # it is used later to compute the 3*3 result matrix
    with open(os.path.join('result','log','logfile.plk'),'wb') as f:
        pickle.dump(logfile,f)  # save to disk
    for mode in range(modes):
        x = []
        y = []
        for doc in os.listdir(documentPath):
            news = None
            news_file_path = os.path.join(documentPath,doc)
            if doc[:3] in ('neg','neu','pos'):
                with open(news_file_path,'r',encoding='utf-8') as f:
                    news = f.read()
                x.append(words2Vec(news,emotionList,stopList,posList,negList,mode=mode))
                if doc.startswith('neg'):
                    y.append(-1)
                elif doc.startswith('neu'):
                    y.append(0)
                else:
                    y.append(1)
                print('In', mode, news_file_path)
        resultX.append(np.array(x))
        resultY.append(np.array(y))
    np.savez(os.path.join('result','vector','resultX.npz'),onehot = resultX[0],wordfreq=resultX[1],twovec=resultX[2],tfidf=resultX[3],outofdict=resultX[4])
    np.savez(os.path.join('result','vector','resultY.npz'),onehot = resultY[0],wordfreq=resultY[1],twovec=resultY[2],tfidf=resultY[3],outofdict=resultY[4])
    print('Over')