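"""Embed.py

Converts the free-text columns of train_data.csv / test_data.csv (column 1:
description, column 2: class) into bag-of-words feature vectors, converts the
category column (column 3) into integer labels, and pickles the results
(trfea/trlab/tefea/telab) along with the word and category lookup dictionaries.
The two csv files are expected to have been produced by Shuffle.py.
"""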
# Most of these numpy names are unused in this file, but they may be referenced
# by snippets exec()'d from the command line below, so they are kept.
from numpy import cos, arccos, sin, arctan, tan, pi, sqrt
from numpy import array as ary
import numpy as np
tau = 2 * pi
import pandas as pd
import pickle
import time
UNLABELLED = -1           # integer label assigned to rows whose category is missing or unseen
EMBED_BY_COUNTING = True  # count word occurrences; set to False to record word presence only
start_time = time.time()
try:
    import os, sys
    os.chdir(sys.argv[1])
    if len(sys.argv[2:]) > 0:
        exec(''.join(sys.argv[2:]))  # any further arguments are executed as Python code
except IndexError:
    print("usage:")
    print("'python " + sys.argv[0] + " <folder containing train_data.csv and test_data.csv>/'")
    print("They can be automatically generated by Shuffle.py")
    sys.exit()
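# Example invocation (hypothetical folder name; assumes Shuffle.py has already
# produced the two csv files):
#   python Embed.py my_dataset/ "EMBED_BY_COUNTING = False"
# The trailing snippet is exec()'d above, so module-level flags such as
# EMBED_BY_COUNTING can be overridden before the conversion starts.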
try:
    tr_frame = pd.read_csv("train_data.csv", header=None, index_col=0).fillna('')
    te_frame = pd.read_csv("test_data.csv", header=None, index_col=0).fillna('')
    if 3 not in te_frame.columns:  # if column 3 (the category) is entirely unfilled
        te_frame[3] = ''
except FileNotFoundError:
    print("Please preprocess that directory with 'Shuffle.py' first;")
    print("or, alternatively, place 'train_data.csv' and 'test_data.csv' there directly.")
    sys.exit()
EMBED_BY_PRESENCE = not EMBED_BY_COUNTING
def get_unique_words(descriptions):
    allwords = " ".join(list(descriptions))  # join all descriptions into a single string
    allwords = allwords.lower().split(" ")   # lower-case every word to minimize duplication
    uniquewords = list(set(allwords))        # deduplicate by passing through a set
    uniquewords.sort()
    uniquewords = [i for i in uniquewords if (i != "" and i.isprintable())]  # drop the empty string and unprintable tokens
    print("{0} unique words were found and used as the features for learning.".format(len(uniquewords)))
    return uniquewords  # a sorted list of all unique words
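# A minimal sketch of the behavior (hypothetical input):
#   get_unique_words(pd.Series(["Red apple", "red BALL"]))  ->  ['apple', 'ball', 'red']
# Tokenization is a plain split on spaces, so punctuation stays attached to its word.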
def save_inversion_dict(set_as_list, data="category"):
    allcombo_dict = dict(enumerate(set_as_list))  # {index: value}, which can be pickled
    with open("inversion_" + data + ".pkl", "wb") as f:
        pickle.dump(allcombo_dict, f)
    return

def save_conversion_dict(set_as_list, data="category"):
    allcombo_dict = {value: index for index, value in enumerate(set_as_list)}  # {value: index}
    with open("conversion_" + data + ".pkl", "wb") as f:
        pickle.dump(allcombo_dict, f)
    return
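# Sketch (hypothetical values): for set_as_list = ['apple', 'ball'] the inversion
# dict is {0: 'apple', 1: 'ball'} and the conversion dict is {'apple': 0, 'ball': 1},
# so the integer <-> string mapping remains invertible after unpickling.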
def get_set_of_categories(pdseries):
    s = set(pdseries)
    s.discard('')  # an empty string means the row is unlabelled, not a category of its own
    list_of_set_of_category = list(s)
    list_of_set_of_category.sort()
    return list_of_set_of_category
def convert2embedding(pdseries, uniquewords_list):
    new_series = {}
    if EMBED_BY_COUNTING:
        for index, row in pdseries.items():
            row = row.lower().split(" ")
            # use the number of occurrences instead of a boolean
            new_series.update({index: [row.count(word) for word in uniquewords_list]})
    elif EMBED_BY_PRESENCE:
        for index, row in pdseries.items():
            row = row.lower().split(" ")
            new_series.update({index: [1 if word in row else 0 for word in uniquewords_list]})
    return pd.Series(new_series)
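# Sketch (hypothetical values): with uniquewords_list = ['apple', 'ball', 'red'],
# the description "red red apple" embeds to [1, 0, 2] when counting occurrences,
# or to [1, 0, 1] when EMBED_BY_PRESENCE is in effect instead.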
def convert2integerrepresentation(pdseries, category_list):
    new_series = {}
    for index, row in pdseries.items():
        # no lower() operation needed: categories are matched verbatim
        try:
            integer_required = category_list.index(row)  # find the matching value
        except ValueError:
            integer_required = UNLABELLED
        new_series.update({index: integer_required})
    return pd.Series(new_series)
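# Sketch (hypothetical values): with category_list = ['fruit', 'toy'],
# 'fruit' -> 0, 'toy' -> 1, and an unseen label such as 'tool' -> UNLABELLED (-1).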
def splitin2columns(pdseries):
    """Expand vectors stored in a single pd.Series (i.e. in a single column) into one column per component."""
    assert isinstance(pdseries, pd.Series), "only accepts a single column (pd.Series) object"
    new_df = pd.DataFrame(list(pdseries), index=pdseries.index)  # columns are auto-labelled 0..n-1
    return new_df  # return a DataFrame
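# Sketch: a Series holding [[1, 0, 2], [0, 1, 0]] becomes a 2x3 DataFrame whose
# columns 0, 1, 2 follow the order of uniquewords_list.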
# turn the dataframes into a dictionary for ease of iteration
frames = {'tr': tr_frame, 'te': te_frame}

# do the embedding conversion
hybrid_data = {}  # combined text from column 1 (description) and column 2 (class)
hybrid_data.update({"tr": tr_frame[1] + " " + tr_frame[2]})
hybrid_data.update({"te": te_frame[1] + " " + te_frame[2]})
# Use all of the words that appear in the training set as features; test-set words
# outside this vocabulary are simply ignored by convert2embedding.
uniquewords_tr = get_unique_words(hybrid_data["tr"])

# save the word <-> index conversion and inversion dictionaries
save_inversion_dict(uniquewords_tr, data="description_and_class")
save_conversion_dict(uniquewords_tr, data="description_and_class")

# convert all categories into numerical representations
list_of_set_of_category = get_set_of_categories(tr_frame[3])
maxlen = len(list_of_set_of_category)  # number of distinct categories

# save the category <-> index dictionaries as well
save_inversion_dict(list_of_set_of_category)
save_conversion_dict(list_of_set_of_category)
# populate the output dataframes
train_test_feature_label = {}
print("Starting the conversion; please be patient as it can take up to 1 minute.")
for t in ('tr', 'te'):
    embedding_representation = convert2embedding(hybrid_data[t], uniquewords_tr)
    print("Embedding complete for {0}fea, program run time so far = {1}".format(t, time.time() - start_time))
    category_sparse_rep = convert2integerrepresentation(frames[t][3], list_of_set_of_category)
    train_test_feature_label.update({t + 'fea': splitin2columns(embedding_representation)})
    print("Sparse embedding for {0}lab complete, program run time so far = {1}".format(t, time.time() - start_time))
    train_test_feature_label.update({t + 'lab': category_sparse_rep})

# pickle the four outputs: trfea, trlab, tefea, telab
for k, v in train_test_feature_label.items():
    with open(k + '.pkl', 'wb') as f:
        pickle.dump(v, f)