import pickle
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from docopt import docopt

# Fix the RNG seed so fold assignment and shuffling are reproducible.
np.random.seed(3306)


# Load sentences from `filename`, labeling each with `label`, assign each
# sentence to one of `cv` random folds, and update vocabulary counts in place.
def build_data_cv(filename, label, sentences, vocab, cv=10, clean_string=True):
    with open(filename, 'r') as f:
        for line in f:
            rev = [line.strip()]
            if clean_string:
                orig_rev = clean_str(' '.join(rev))
            else:
                orig_rev = ' '.join(rev).lower()
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum = {'y': label,
                     'text': orig_rev,
                     'num_words': len(orig_rev.split()),
                     'split': np.random.randint(0, cv)}
            sentences.append(datum)
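
# For illustration (made-up values), a processed datum looks like:
#   {'y': 1, 'text': 'a gripping , beautifully shot film', 'num_words': 6, 'split': 3}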


# Build the word matrix W, where W[i] is the vector for the word with index i,
# plus a map from each word to its row index.
def get_W(word_vecs, k=300):
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    # vocab_size + 1 rows: row 0 stays all-zero as a padding vector,
    # real words occupy rows 1..vocab_size.
    W = np.zeros(shape=(vocab_size + 1, k), dtype='float32')
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map
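
# Hypothetical usage: after `W, word_idx_map = get_W(vectors)`, the row
# W[word_idx_map['film']] is the 300-dim vector for 'film' (if present),
# and W[0] is the zero padding row.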


# Load 300-dim word vectors for words in `vocab` from a word2vec-style binary
# file: a text header "<vocab_size> <dim>", then one record per word holding
# the space-terminated word followed by <dim> packed float32 values.
def load_bin_vec(fname, vocab):
    word_vecs = {}
    with open(fname, 'rb') as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for _ in range(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    word = b''.join(word)
                    break
                if ch != b'\n':
                    word.append(ch)
            word = word.decode('utf-8')
            if word in vocab:
                word_vecs[word] = np.frombuffer(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)  # skip vectors for out-of-vocab words
    return word_vecs
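
# A typical call (assuming the standard Google News word2vec binary, which is
# not part of this repo):
#   vectors = load_bin_vec('GoogleNews-vectors-negative300.bin', vocab)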


# Add random vectors for words that are missing from the pre-trained file.
# If no pre-trained vectors are used, this initializes every vocab word that
# occurs at least `min_df` times with random values.
def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
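
# The +/-0.25 range is presumably chosen so that the random vectors have
# roughly the same variance as the pre-trained word2vec vectors.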


# Clean and tokenize a sentence: strip characters outside a small whitelist,
# split off common English contractions, and pad punctuation with spaces.
def clean_str(string):
    string = re.sub(r'[^A-Za-z0-9(),!?\'`]', ' ', string)
    string = re.sub(r'\'s', ' \'s', string)
    string = re.sub(r'\'ve', ' \'ve', string)
    string = re.sub(r'n\'t', ' n\'t', string)
    string = re.sub(r'\'re', ' \'re', string)
    string = re.sub(r'\'d', ' \'d', string)
    string = re.sub(r'\'ll', ' \'ll', string)
    string = re.sub(r',', ' , ', string)
    string = re.sub(r'!', ' ! ', string)
    string = re.sub(r'\(', ' ( ', string)
    string = re.sub(r'\)', ' ) ', string)
    string = re.sub(r'\?', ' ? ', string)
    string = re.sub(r'\s{2,}', ' ', string)
    return string.strip().lower()
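
# For example: clean_str("It's great, isn't it?") -> "it 's great , is n't it ?"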


# Main entry point.
def main():
    args = docopt('''
    Usage:
        process_data.py <vectors_file>
    ''')
    print('############')
    print('process data')
    print('############')
    vectors_file = args['<vectors_file>']  # pre-trained word vectors file
    data_folder = ['rt-polarity.neg', 'rt-polarity.pos']  # data files
    datafile = 'mr.p'  # output file for data and word vectors
    print('Loading Data...')
    sentences = []  # processed sentences
    vocab = defaultdict(float)  # vocabulary with word counts
    # process data: label 0 for negative reviews, 1 for positive ones
    build_data_cv(data_folder[0], 0, sentences, vocab, cv=10, clean_string=True)
    build_data_cv(data_folder[1], 1, sentences, vocab, cv=10, clean_string=True)
    np.random.shuffle(sentences)
    maxlen = np.max(pd.DataFrame(sentences)['num_words'])  # max sentence length
    print('Data Loaded!')
    print('Number Of Sentences: ' + str(len(sentences)))
    print('Vocab Size: ' + str(len(vocab)))
    print('Max Sentence Length: ' + str(maxlen))
    print('Loading Vectors...')
    vectors = load_bin_vec(vectors_file, vocab)  # pre-trained vectors
    print('Vectors Loaded!')
    print('Words Already In Vectors: ' + str(len(vectors)))
    # add random vectors for vocab words missing from the pre-trained file
    add_unknown_words(vectors, vocab)
    W, word_idx_map = get_W(vectors)  # word matrix and word-to-index map
    rand_vecs = {}
    # a second word matrix initialized entirely with random vectors
    add_unknown_words(rand_vecs, vocab)
    W2, _ = get_W(rand_vecs)
    # save sentences and vectors
    with open(datafile, 'wb') as f:
        pickle.dump([sentences, W, W2, word_idx_map, vocab, maxlen], f)
    print('Dataset created!')


if __name__ == '__main__':
    main()
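
# A minimal sketch of consuming the pickled dataset downstream (the variable
# names are illustrative, not defined by this script):
#   with open('mr.p', 'rb') as f:
#       sentences, W, W2, word_idx_map, vocab, maxlen = pickle.load(f)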