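"""Embed.py

Converts the free-text columns of train_data.csv / test_data.csv (column 1:
description, column 2: class) into bag-of-words feature vectors, converts the
category column (column 3) into integer labels, and pickles the results
(trfea/trlab/tefea/telab) along with the word and category lookup dictionaries.
The two csv files are expected to have been produced by Shuffle.py.
"""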
# Most of these numpy names are unused in this file, but they may be referenced
# by snippets exec()'d from the command line below, so they are kept.
from numpy import cos, arccos, sin, arctan, tan, pi, sqrt
from numpy import array as ary
import numpy as np
tau = 2 * pi
import pandas as pd
import pickle
import time
UNLABELLED = -1           # integer label assigned to rows whose category is missing or unseen
EMBED_BY_COUNTING = True  # count word occurrences; set to False to record word presence only
start_time = time.time()
try:
    import os, sys
    os.chdir(sys.argv[1])
    if len(sys.argv[2:]) > 0:
        exec(''.join(sys.argv[2:]))  # any further arguments are executed as Python code
except IndexError:
    print("usage:")
    print("'python " + sys.argv[0] + " <folder containing train_data.csv and test_data.csv>/'")
    print("They can be automatically generated by Shuffle.py")
    sys.exit()
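# Example invocation (hypothetical folder name; assumes Shuffle.py has already
# produced the two csv files):
#   python Embed.py my_dataset/ "EMBED_BY_COUNTING = False"
# The trailing snippet is exec()'d above, so module-level flags such as
# EMBED_BY_COUNTING can be overridden before the conversion starts.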
try:
    tr_frame = pd.read_csv("train_data.csv", header=None, index_col=0).fillna('')
    te_frame = pd.read_csv("test_data.csv", header=None, index_col=0).fillna('')
    if 3 not in te_frame.columns:  # if column 3 (the category) is entirely unfilled
        te_frame[3] = ''
except FileNotFoundError:
    print("Please preprocess that directory with 'Shuffle.py' first;")
    print("or, alternatively, place 'train_data.csv' and 'test_data.csv' there directly.")
    sys.exit()
EMBED_BY_PRESENCE = not EMBED_BY_COUNTING
def get_unique_words(descriptions):
    allwords = " ".join(list(descriptions))  # join all descriptions into a single string
    allwords = allwords.lower().split(" ")   # lower-case every word to minimize duplication
    uniquewords = list(set(allwords))        # deduplicate by passing through a set
    uniquewords.sort()
    uniquewords = [i for i in uniquewords if (i != "" and i.isprintable())]  # drop the empty string and unprintable tokens
    print("{0} unique words were found and used as the features for learning.".format(len(uniquewords)))
    return uniquewords  # a sorted list of all unique words
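# A minimal sketch of the behavior (hypothetical input):
#   get_unique_words(pd.Series(["Red apple", "red BALL"]))  ->  ['apple', 'ball', 'red']
# Tokenization is a plain split on spaces, so punctuation stays attached to its word.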
def save_inversion_dict(set_as_list, data="category"):
    allcombo_dict = dict(enumerate(set_as_list))  # {index: value}, which can be pickled
    with open("inversion_" + data + ".pkl", "wb") as f:
        pickle.dump(allcombo_dict, f)
    return

def save_conversion_dict(set_as_list, data="category"):
    allcombo_dict = {value: index for index, value in enumerate(set_as_list)}  # {value: index}
    with open("conversion_" + data + ".pkl", "wb") as f:
        pickle.dump(allcombo_dict, f)
    return
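# Sketch (hypothetical values): for set_as_list = ['apple', 'ball'] the inversion
# dict is {0: 'apple', 1: 'ball'} and the conversion dict is {'apple': 0, 'ball': 1},
# so the integer <-> string mapping remains invertible after unpickling.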
def get_set_of_categories(pdseries):
    s = set(pdseries)
    s.discard('')  # an empty string means the row is unlabelled, not a category of its own
    list_of_set_of_category = list(s)
    list_of_set_of_category.sort()
    return list_of_set_of_category
def convert2embedding(pdseries, uniquewords_list):
    new_series = {}
    if EMBED_BY_COUNTING:
        for index, row in pdseries.items():
            row = row.lower().split(" ")
            # use the number of occurrences instead of a boolean
            new_series.update({index: [row.count(word) for word in uniquewords_list]})
    elif EMBED_BY_PRESENCE:
        for index, row in pdseries.items():
            row = row.lower().split(" ")
            new_series.update({index: [1 if word in row else 0 for word in uniquewords_list]})
    return pd.Series(new_series)
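# Sketch (hypothetical values): with uniquewords_list = ['apple', 'ball', 'red'],
# the description "red red apple" embeds to [1, 0, 2] when counting occurrences,
# or to [1, 0, 1] when EMBED_BY_PRESENCE is in effect instead.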
def convert2integerrepresentation(pdseries, category_list):
    new_series = {}
    for index, row in pdseries.items():
        # no lower() operation needed: categories are matched verbatim
        try:
            integer_required = category_list.index(row)  # find the matching value
        except ValueError:
            integer_required = UNLABELLED
        new_series.update({index: integer_required})
    return pd.Series(new_series)
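# Sketch (hypothetical values): with category_list = ['fruit', 'toy'],
# 'fruit' -> 0, 'toy' -> 1, and an unseen label such as 'tool' -> UNLABELLED (-1).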
def splitin2columns(pdseries):
    """Expand vectors stored in a single pd.Series (i.e. in a single column) into one column per component."""
    assert isinstance(pdseries, pd.Series), "only accepts a single column (pd.Series) object"
    new_df = pd.DataFrame(list(pdseries), index=pdseries.index)  # columns are auto-labelled 0..n-1
    return new_df  # return a DataFrame
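# Sketch: a Series holding [[1, 0, 2], [0, 1, 0]] becomes a 2x3 DataFrame whose
# columns 0, 1, 2 follow the order of uniquewords_list.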
# turn the dataframes into a dictionary for ease of iteration
frames = {'tr': tr_frame, 'te': te_frame}

# do the embedding conversion
hybrid_data = {}  # combined text from column 1 (description) and column 2 (class)
hybrid_data.update({"tr": tr_frame[1] + " " + tr_frame[2]})
hybrid_data.update({"te": te_frame[1] + " " + te_frame[2]})
# Use all of the words that appear in the training set as features; test-set words
# outside this vocabulary are simply ignored by convert2embedding.
uniquewords_tr = get_unique_words(hybrid_data["tr"])

# save the word <-> index conversion and inversion dictionaries
save_inversion_dict(uniquewords_tr, data="description_and_class")
save_conversion_dict(uniquewords_tr, data="description_and_class")

# convert all categories into numerical representations
list_of_set_of_category = get_set_of_categories(tr_frame[3])
maxlen = len(list_of_set_of_category)  # number of distinct categories

# save the category <-> index dictionaries as well
save_inversion_dict(list_of_set_of_category)
save_conversion_dict(list_of_set_of_category)
# populate the output dataframes
train_test_feature_label = {}
print("Starting the conversion; please be patient as it can take up to 1 minute.")
for t in ('tr', 'te'):
    embedding_representation = convert2embedding(hybrid_data[t], uniquewords_tr)
    print("Embedding complete for {0}fea, program run time so far = {1}".format(t, time.time() - start_time))
    category_sparse_rep = convert2integerrepresentation(frames[t][3], list_of_set_of_category)
    train_test_feature_label.update({t + 'fea': splitin2columns(embedding_representation)})
    print("Sparse embedding for {0}lab complete, program run time so far = {1}".format(t, time.time() - start_time))
    train_test_feature_label.update({t + 'lab': category_sparse_rep})

# pickle the four outputs: trfea, trlab, tefea, telab
for k, v in train_test_feature_label.items():
    with open(k + '.pkl', 'wb') as f:
        pickle.dump(v, f)