-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_cleaner.py
129 lines (109 loc) · 4.26 KB
/
data_cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
'''
data_cleaner.py -- utilities for loading, cleaning, and splitting the
search-strings data set (CSV in, cleaned titles and stratified
train/valid/test partitions out).
'''
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
def read_search_strings(file_path='../search_strings.csv'):
    """Load the search-strings CSV into a DataFrame.

    :param file_path: location of the CSV file (defaults to the
        search_strings.csv one directory up).
    :return: pandas DataFrame; the first CSV row is used as the header.
    """
    return pd.read_csv(file_path, sep=',', header=0, encoding='latin1')
def cleanup_categoryid(df):
    """Assign a dense integer category id (starting at 0) to every row.

    The raw category label is taken from the 4th column of each row.  The
    first distinct label seen gets id 0, the next new one id 1, and so on.

    ** This function modifies df in place (adds/overwrites 'categoryId') **

    :param df: DataFrame whose 4th column holds the raw category label
    :return: [df, category_dict] where category_dict[label] = categoryId
    """
    category_dict = dict()
    for j, row in df.iterrows():
        # row.iloc[3] is the explicit positional form of the old row[3]
        category = row.iloc[3]
        if category not in category_dict:
            # first time we see this label: allocate the next dense id
            category_dict[category] = len(category_dict)
        # BUG FIX: the original wrote the running counter here even for
        # repeated labels, so a repeat got the id of the most recently
        # *added* category instead of its own.  Look the id up instead.
        df.at[j, 'categoryId'] = category_dict[category]
    return [df, category_dict]
def clean_item_data(m, df=None):
    """Normalise the 'item_title' column of the search-strings data.

    Removes '.' and non-alphanumeric characters at the end of each word
    (e.g. 'oz.') while preserving interior dots ('7.5mm').  Characters
    that isalnum() accepts (e.g. 'Æ') are kept in mode 1.

    :param m: cleaning mode.
        1 -- keep alphanumerics plus " ' space . $, then strip one
             trailing '.' from each word ('oz.' -> 'oz', '7.5mm' kept).
        0 -- keep only alphabetic characters and spaces, lower-cased.
    :param df: optional DataFrame to clean.  When None (the default,
        preserving the original behaviour) the CSV is loaded from disk
        via read_search_strings().
    :return: the DataFrame with 'item_title' rewritten in place.

    Note: as in the original implementation, every non-empty cleaned
    title ends with a single trailing space (each word is emitted
    followed by ' ').
    """
    if df is None:
        # backwards-compatible default: read the csv from disk
        df = read_search_strings()
    for index, row in df.iterrows():
        title = row['item_title']
        if m == 1:
            # filter the character set, then strip one trailing '.'
            # per word so 'oz.' -> 'oz' but '7.5mm' survives
            kept = ''.join(c for c in title
                           if c.isalnum() or c in '"\' .$')
            words = [w[:-1] if w.endswith('.') else w
                     for w in kept.split()]
            df.at[index, 'item_title'] = ''.join(w + ' ' for w in words)
        elif m == 0:
            # letters and spaces only, lower-cased; split/rejoin also
            # collapses runs of whitespace
            kept = ''.join(c.lower() for c in title
                           if c.isalpha() or c == ' ')
            df.at[index, 'item_title'] = ''.join(
                w + ' ' for w in kept.split())
    return df
def data_split(df, train=0.65, valid=0.15, test=0.20):
    """
    Split the data into training, validation, and test sets, stratified
    per category (each categoryId is split separately so every category
    is represented in every partition).

    :param df: the data set; needs 'item_title' and 'categoryId' columns
    :param train: fraction of training data (pass 1 to skip splitting)
    :param valid: fraction of validation data (0 disables validation set)
    :param test: fraction of test data
    :return: X_train, X_valid, X_test, Y_train, Y_valid, Y_test
        (the valid frames stay empty when valid == 0).  Special case:
        when train == 1 the return is just (x_df, y_df), the untouched
        feature and label columns.
    """
    # degenerate case: no split requested, hand back the raw columns
    if train == 1:
        return df['item_title'], df['categoryId']
    X_train = pd.DataFrame()
    X_valid = pd.DataFrame()
    X_test = pd.DataFrame()
    Y_train = pd.DataFrame()
    Y_valid = pd.DataFrame()
    Y_test = pd.DataFrame()
    # BUG FIX: the original looped over range(1, nunique+1), but
    # cleanup_categoryid() assigns ids starting at 0, so category 0 was
    # silently dropped and the final iteration selected an empty frame.
    # Iterate over the ids that actually occur instead.
    for cat_id in sorted(df['categoryId'].unique()):
        mask = df['categoryId'] == cat_id
        x_cat = df.loc[mask]['item_title']
        y_cat = df.loc[mask]['categoryId']
        x_tv, x_te, y_tv, y_te = \
            train_test_split(x_cat, y_cat, test_size=test)
        if valid != 0:
            # carve the validation share out of the train+valid remainder
            x_tr, x_va, y_tr, y_va = \
                train_test_split(x_tv, y_tv,
                                 train_size=train / (train + valid))
            X_train = pd.concat([X_train, x_tr], axis=0)
            X_valid = pd.concat([X_valid, x_va], axis=0)
            Y_train = pd.concat([Y_train, y_tr], axis=0)
            Y_valid = pd.concat([Y_valid, y_va], axis=0)
        else:
            X_train = pd.concat([X_train, x_tv], axis=0)
            Y_train = pd.concat([Y_train, y_tv], axis=0)
        X_test = pd.concat([X_test, x_te], axis=0)
        Y_test = pd.concat([Y_test, y_te], axis=0)
    return X_train, X_valid, X_test, Y_train, Y_valid, Y_test
if __name__ == '__main__':
    # quick smoke run: clean titles in mode 0 and show the result
    cleaned = clean_item_data(0)
    print(cleaned)