-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathdata_processor.py
138 lines (122 loc) · 5.81 KB
/
data_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# -*- coding: utf-8 -*-
"""
Created on Thu May 21 19:19:01 2020

Read the raw data and preprocess it.
Count the 5k most frequent words in the training data and build the
vocabulary (word vectors) from them; the test data reuses the vocabulary
built from the training data.
@author:
"""
import os
from collections import Counter

import torch
import torch.nn as nn
from torch.autograd import Variable
# Fix the RNG seed so the randomly-initialized nn.Embedding created in
# DataProcessor.get_datasets is reproducible across runs.
torch.manual_seed(123)
class DataProcessor(object):
    """Load the aclImdb review texts, build a vocabulary from the training
    split, and convert both splits into fixed-length embedded TensorDatasets.
    """

    def read_text(self, is_train_data):
        """Read raw review texts and one-hot labels from disk.

        Args:
            is_train_data: True reads the train split, False the test split.

        Returns:
            (datas, labels): a list of raw review strings and a parallel list
            of one-hot labels — [1, 0] for positive, [0, 1] for negative.
        """
        datas = []
        labels = []
        if is_train_data:
            # Training data directories
            pos_path = "./datasets/aclImdb/train/pos/"
            neg_path = "./datasets/aclImdb/train/neg/"
        else:
            # Test data directories
            pos_path = "./datasets/aclImdb/test/pos/"
            neg_path = "./datasets/aclImdb/test/neg/"
        # Positive reviews first, then negative — same order as before.
        for dir_path, label in ((pos_path, [1, 0]), (neg_path, [0, 1])):
            for file_name in os.listdir(dir_path):  # every file in the folder
                with open(dir_path + file_name, "r", encoding='utf-8') as f:
                    datas.append(f.read())
                # Append a fresh list per sample so labels are not aliased.
                labels.append(list(label))
        return datas, labels

    def word_count(self, datas):
        """Count lower-cased word frequencies over all texts.

        Returns:
            A list of (word, count) pairs sorted by count, descending
            (stable for ties, matching dict insertion order).
        """
        counter = Counter()
        for data in datas:
            counter.update(word.lower() for word in data.split())
        # most_common() == sorted(items, key=count, reverse=True), stable.
        return counter.most_common()

    def word_index(self, datas, vocab_size):
        """Build the word -> index vocabulary from the most frequent words.

        Index 0 is reserved for <unk> (out-of-vocabulary words) and index 1
        for <pad> (sentence padding); real words start at index 2.

        Args:
            datas: list of raw texts to build the vocabulary from.
            vocab_size: requested maximum number of real words.

        Returns:
            (word2index, actual_vocab_size) where actual_vocab_size is capped
            by the number of distinct words seen.
        """
        word_count_sorted = self.word_count(datas)
        word2index = {"<unk>": 0, "<pad>": 1}
        # The effective vocabulary size is limited by the distinct word count.
        vocab_size = min(len(word_count_sorted), vocab_size)
        for i in range(vocab_size):
            word2index[word_count_sorted[i][0]] = i + 2
        return word2index, vocab_size

    def _texts_to_features(self, datas, word2index, max_len):
        """Map each text to a fixed-length list of vocabulary indices.

        Words are lower-cased (the vocabulary is all lower-case); words not
        in the vocabulary map to <unk>; texts longer than max_len are
        truncated and shorter ones are right-padded with <pad>.
        """
        unk = word2index["<unk>"]
        pad = word2index["<pad>"]
        features = []
        for data in datas:
            # Truncating the token list up front is equivalent to breaking
            # out of the loop once max_len indices have been collected.
            feature = [word2index.get(word.lower(), unk)
                       for word in data.split()[:max_len]]
            feature += [pad] * (max_len - len(feature))
            features.append(feature)
        return features

    def get_datasets(self, vocab_size, embedding_size, max_len):
        """Build embedded TensorDatasets for the train and test splits.

        The vocabulary is built from the training split only. Because
        nn.Embedding is randomly initialized, both splits are embedded here
        with the same instance so their representations are consistent.

        Args:
            vocab_size: requested vocabulary size (actual may be smaller).
            embedding_size: dimensionality of each word embedding.
            max_len: sentences are truncated/padded to this many tokens.

        Returns:
            (train_datasets, test_datasets): TensorDatasets pairing the
            embedded features with their one-hot float labels.
        """
        train_datas, train_labels = self.read_text(is_train_data=True)
        word2index, vocab_size = self.word_index(train_datas, vocab_size)
        test_datas, test_labels = self.read_text(is_train_data=False)

        # All rows have length max_len, so the LongTensor conversion is valid.
        train_features = torch.LongTensor(
            self._texts_to_features(train_datas, word2index, max_len))
        test_features = torch.LongTensor(
            self._texts_to_features(test_datas, word2index, max_len))
        train_labels = torch.FloatTensor(train_labels)
        test_labels = torch.FloatTensor(test_labels)

        # +2 accounts for the special <unk> and <pad> vocabulary entries.
        embed = nn.Embedding(vocab_size + 2, embedding_size)
        # no_grad(): the stored features must not carry gradients — this
        # replaces the deprecated Variable(..., requires_grad=False) and
        # avoids retaining an autograd graph over the whole dataset.
        with torch.no_grad():
            train_features = embed(train_features)
            test_features = embed(test_features)

        train_datasets = torch.utils.data.TensorDataset(train_features, train_labels)
        test_datasets = torch.utils.data.TensorDataset(test_features, test_labels)
        return train_datasets, test_datasets