process_data_mr.py
__author__ = 'mangate'
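
# Preprocesses the MR (movie review) sentence-polarity data: reads
# rt-polarity.pos / rt-polarity.neg, cleans and tokenizes each review, assigns
# every review to a random cross-validation fold, builds a word2vec-initialised
# and a purely random embedding matrix, and pickles everything into one file.
# (Python 2 script: uses cPickle and print statements.)
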
import numpy as np
import cPickle
from collections import defaultdict
import re
import pandas as pd
import os.path
from process_data_common import load_bin_vec, add_unknown_words, get_W


def build_data_cv(data_folder, cv=10, clean_string=True):
    """
    Loads the positive/negative review files and splits them into cv random folds.
    """
    revs = []
    pos_file = data_folder[0]
    neg_file = data_folder[1]
    vocab = defaultdict(float)
    files = [neg_file, pos_file]  # file index doubles as the label: 0 = negative, 1 = positive
    for i in range(len(files)):
        with open(files[i], "rb") as f:
            for line in f:
                rev = []
                rev.append(line.strip())
                if clean_string:
                    orig_rev = clean_str(" ".join(rev))
                else:
                    orig_rev = " ".join(rev).lower()
                words = set(orig_rev.split())
                for word in words:
                    vocab[word] += 1  # document frequency: each word counted once per review
                datum = {"y": i,
                         "text": orig_rev,
                         "num_words": len(orig_rev.split()),
                         "split": np.random.randint(0, cv)}  # random cross-validation fold
                revs.append(datum)
    return revs, vocab
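
# Each entry of `revs` is a dict of the following form (values are illustrative):
#   {"y": 1, "text": "a thoughtful and engaging film", "num_words": 5, "split": 3}
# where "y" is the label (0 = negative, 1 = positive) and "split" is the CV fold.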


def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower-cased except for TREC.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip() if TREC else string.strip().lower()
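
# Illustrative behaviour of clean_str (with TREC=False):
#   clean_str("Don't stop!")  ->  "do n't stop !"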


def process_data(file_name):
    if os.path.isfile(file_name):
        print "file {} already exists".format(file_name)
        return
    print "creating dataset..."
    # load data and split it into random cross-validation folds
    print "loading data...",
    data_folder = ["data/mr/rt-polarity.pos", "data/mr/rt-polarity.neg"]
    revs, vocab = build_data_cv(data_folder, cv=10, clean_string=True)
    max_l = np.max(pd.DataFrame(revs)["num_words"])
    print "data loaded!"
    print "number of sentences: " + str(len(revs))
    print "vocab size: " + str(len(vocab))
    print "max sentence length: " + str(max_l)
    # load pre-trained word2vec vectors for the words that appear in the vocabulary
    print "loading word2vec vectors...",
    w2v_file = 'data/GoogleNews-vectors-negative300.bin'
    w2v = load_bin_vec(w2v_file, vocab)
    print "num words already in word2vec: " + str(len(w2v))
    print "word2vec loaded!"
    # add random vectors for all words not covered by word2vec
    add_unknown_words(w2v, vocab)
    W, word_idx_map = get_W(w2v)  # W: embedding matrix seeded with word2vec vectors
    rand_vecs = {}
    add_unknown_words(rand_vecs, vocab)
    W2, _ = get_W(rand_vecs)  # W2: embedding matrix with random vectors for every word
    # dump everything to a single pickle file
    cPickle.dump([revs, W, W2, word_idx_map, vocab, max_l], open(file_name, "wb"))
    print "dataset created!"