# -*- coding: utf-8 -*-
"""
Created on Tue Oct 20 19:30:15 2015
@author: Apple
"""
#import re
#import datetime
import os
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from io import StringIO
from sklearn import metrics
from sklearn.metrics import zero_one_loss
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
#cross_validation and grid_search were folded into model_selection in newer scikit-learn
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_graphviz
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA
from sklearn import decomposition
path = 'C:/Sathya/Subject/Fall-2015/BIA-656/Project/Rossman/'
#Run the full pipeline twice: once on the train file, once on the test file
dataset = ['train', 'test']
for data in dataset:
    #Assign the absolute path of the data file to a variable
    file_path_train = os.path.abspath(path + '{0}.csv'.format(data))
    file_path_store = os.path.abspath(path + 'store_new.csv')
    #Assign the contents of the data file to a pandas dataframe (header=0 treats the first row as column names)
    df_train = pd.read_csv(file_path_train, sep=',', header=0)
    df_store = pd.read_csv(file_path_store, sep=',', header=0)
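    #Note: train.csv's StateHoliday column mixes the string '0' with the integer 0,
    #so pandas may warn about mixed dtypes on read; passing dtype={'StateHoliday': str}
    #to read_csv would avoid that, but the mapping below normalizes both forms anyway.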
    ##########################################################################################################
    print ('START -- Process to convert non-integer labels to integers')
    #Categorical variables with non-integer labels mapped to integers
    #StateHoliday (in train data set)
    #df_train.StateHoliday.unique() = array(['0', 'a', 'b', 'c', 0], dtype=object)
    #0 = ('0' or 0)   1 = 'a'   2 = 'b'   3 = 'c'   else 999
    df_train['StateHoliday'] = df_train['StateHoliday'].apply(lambda value: 0 if (value == '0' or value == 0) else 1 if value == 'a' else 2 if value == 'b' else 3 if value == 'c' else 999)
    #StoreType (in store data set)
    #df_store.StoreType.unique() = array(['c', 'a', 'd', 'b'], dtype=object)
    #1 = 'a'   2 = 'b'   3 = 'c'   4 = 'd'   else 999
    df_store['StoreType'] = df_store['StoreType'].apply(lambda value: 1 if value == 'a' else 2 if value == 'b' else 3 if value == 'c' else 4 if value == 'd' else 999)
    #Assortment (in store data set)
    #df_store.Assortment.unique() = array(['a', 'c', 'b'], dtype=object)
    #1 = 'a'   2 = 'b'   3 = 'c'   else 999
    df_store['Assortment'] = df_store['Assortment'].apply(lambda value: 1 if value == 'a' else 2 if value == 'b' else 3 if value == 'c' else 999)
    #PromoInterval (in store data set)
    #df_store.PromoInterval.unique() = array([nan, 'Jan,Apr,Jul,Oct', 'Feb,May,Aug,Nov', 'Mar,Jun,Sept,Dec'], dtype=object)
    #1 = 'Jan,Apr,Jul,Oct'   2 = 'Feb,May,Aug,Nov'   3 = 'Mar,Jun,Sept,Dec'   else 999 (covers NaN)
    df_store['PromoInterval'] = df_store['PromoInterval'].apply(lambda value: 1 if value == 'Jan,Apr,Jul,Oct' else 2 if value == 'Feb,May,Aug,Nov' else 3 if value == 'Mar,Jun,Sept,Dec' else 999)
    print ('COMPLETE -- Process to convert non-integer labels to integers')
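    #Quick sanity check on the mappings above: 999 is the catch-all for labels
    #not listed in the comments, so apart from PromoInterval (where NaN maps to
    #999 by design) none should appear.
    for col, frame in (('StateHoliday', df_train), ('StoreType', df_store), ('Assortment', df_store)):
        unexpected = int((frame[col] == 999).sum())
        if unexpected:
            print('WARNING: {0} unexpected label(s) in {1}'.format(unexpected, col))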
    ##########################################################################################################
    #scikit-learn does not treat categorical variables as categories:
    #it treats integers and floats as ordinal and coerces strings to floats,
    #which produces meaningless model fits. Each categorical variable is therefore
    #split into multiple indicator columns holding only '1' and '0'.
    #Categorical variable 'DayOfWeek' (in train data set)
    #It has seven unique values, 1 to 7. Creating 7 columns with 1/0 to mean Yes/No
    #New Columns: DoW1, DoW2, DoW3, DoW4, DoW5, DoW6, DoW7
    df_train['DoW1'] = np.where(df_train['DayOfWeek'] == 1, 1, 0) #Create a new column, impute 1 or 0 based on the current value
    df_train['DoW2'] = np.where(df_train['DayOfWeek'] == 2, 1, 0)
    df_train['DoW3'] = np.where(df_train['DayOfWeek'] == 3, 1, 0)
    df_train['DoW4'] = np.where(df_train['DayOfWeek'] == 4, 1, 0)
    df_train['DoW5'] = np.where(df_train['DayOfWeek'] == 5, 1, 0)
    df_train['DoW6'] = np.where(df_train['DayOfWeek'] == 6, 1, 0)
    df_train['DoW7'] = np.where(df_train['DayOfWeek'] == 7, 1, 0)
    del df_train['DayOfWeek']
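    #The seven DoW columns now form a one-hot encoding, so exactly one of them
    #should be 1 on every row; a cheap invariant check:
    assert (df_train[['DoW{0}'.format(d) for d in range(1, 8)]].sum(axis=1) == 1).all()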
    #Categorical variable 'Date' (in train data set)
    #This column is split into 52 week columns and 3 year columns with 1/0 to mean Yes/No
    #New Columns: W1, W2, W3, ... W52 and Y2013, Y2014, Y2015
    df_train['Date_Mod'] = pd.to_datetime(df_train['Date']) #to_datetime(argument): convert argument to datetime
    df_train['Week'] = df_train['Date_Mod'].dt.isocalendar().week.astype(int) #ISO week of year (Series.dt.week was removed in newer pandas)
    df_train['Year'] = df_train['Date_Mod'].dt.year #calendar year of the observation (2013-2015)
    #Create 52 columns (for weeks) and fill with appropriate values
    for i in range(1,53):
        df_train['W{0}'.format(i)] = (df_train['Week'] == i).astype('int')
    #Create 3 columns (for years) and fill with appropriate values
    for i in range(2013,2016):
        df_train['Y{0}'.format(i)] = (df_train['Year'] == i).astype('int')
    del df_train['Date']
    #del df_train['Date_Mod'] #Date_Mod is used later to build other columns, so it is kept for now
    del df_train['Week']
    del df_train['Year']
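    #Caveat: the ISO calendar has a week 53 in some years (2015, for one), and
    #those rows end up with all 52 W columns at 0; extend the loop above to
    #range(1, 54) if a W53 column is wanted. Count of affected rows:
    print('Rows in ISO week 53:', int((df_train['Date_Mod'].dt.isocalendar().week == 53).sum()))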
    #Categorical variable 'StoreType' (in store data set)
    #This column is split into 4 columns for the 4 store types with 1/0 to mean Yes/No
    df_store['StoreTypeA'] = (df_store['StoreType'] == 1).astype('int')
    df_store['StoreTypeB'] = (df_store['StoreType'] == 2).astype('int')
    df_store['StoreTypeC'] = (df_store['StoreType'] == 3).astype('int')
    df_store['StoreTypeD'] = (df_store['StoreType'] == 4).astype('int')
    del df_store['StoreType']
    #Categorical variable 'Assortment' (in store data set)
    #This column is split into 3 columns for the 3 assortment levels with 1/0 to mean Yes/No
    df_store['AssortmentA'] = (df_store['Assortment'] == 1).astype('int')
    df_store['AssortmentB'] = (df_store['Assortment'] == 2).astype('int')
    df_store['AssortmentC'] = (df_store['Assortment'] == 3).astype('int')
    del df_store['Assortment']
    #Categorical variable 'StateHoliday' (in train data set)
    #This column is split into 4 columns for the 4 holiday levels with 1/0 to mean Yes/No
    df_train['StateHolidayNO'] = (df_train['StateHoliday'] == 0).astype('int')
    df_train['StateHolidayA'] = (df_train['StateHoliday'] == 1).astype('int')
    df_train['StateHolidayB'] = (df_train['StateHoliday'] == 2).astype('int')
    df_train['StateHolidayC'] = (df_train['StateHoliday'] == 3).astype('int')
    del df_train['StateHoliday']
    #Imputing missing values in the store dataset column 'CompetitionDistance' with 0
    df_store['CompetitionDistance'] = df_store['CompetitionDistance'].fillna(0)
    #Columns with missing values:
    # -> CompetitionDistance
    # -> CompetitionOpenSinceMonth
    # -> CompetitionOpenSinceYear
    # -> Promo2SinceWeek
    # -> Promo2SinceYear
    #Should we replace all missing values with 999? Not for now.
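    #A quick look at how many NaNs remain in the columns listed above
    #(CompetitionDistance should now report 0):
    print(df_store[['CompetitionDistance', 'CompetitionOpenSinceMonth',
                    'CompetitionOpenSinceYear', 'Promo2SinceWeek',
                    'Promo2SinceYear']].isnull().sum())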
    #Merge the train and store datasets into one master data set - df_master0
    #URL: http://pandas.pydata.org/pandas-docs/stable/merging.html
    df_master0 = pd.merge(df_train, df_store, how='left', on=['Store'])
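    #A left merge keeps every row of the train/test data; if a Store id were
    #missing from store_new.csv its store columns would come back NaN. Guard:
    if df_master0['StoreTypeA'].isnull().any():
        print('WARNING: some stores in {0}.csv are missing from store_new.csv'.format(data))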
    #Combining CompetitionOpenSinceMonth & CompetitionOpenSinceYear into 'NoOfWeeksSinceCompetition'
    #(either both columns have values or neither does)
    #Months since the competitor opened, converted to approximate weeks below (x4)
    df_master0['NoOfWeeksSinceCompetition'] = ((df_master0['Date_Mod'].dt.year - df_master0['CompetitionOpenSinceYear']) * 12 + df_master0['Date_Mod'].dt.month - df_master0['CompetitionOpenSinceMonth'])
    df_master0['NoOfWeeksSinceCompetition'] = df_master0['NoOfWeeksSinceCompetition'].fillna(0)
    df_master0['NoOfWeeksSinceCompetition'] = df_master0['NoOfWeeksSinceCompetition'].apply(lambda value: 0 if value < 0 else value*4) #negative values (future openings) floored at 0
    del df_master0['CompetitionOpenSinceMonth']
    del df_master0['CompetitionOpenSinceYear']
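    #Worked example of the conversion above: a competitor open since 2013-01,
    #observed on 2015-07, gives (2015 - 2013) * 12 + 7 - 1 = 30 months,
    #i.e. roughly 30 * 4 = 120 weeks.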
    #Combining Promo2SinceWeek & Promo2SinceYear into 'NoOfWeeksSincePromo2'
    #(either both columns have values or neither does)
    df_master0['NoOfWeeksSincePromo2'] = ((df_master0['Date_Mod'].dt.year - df_master0['Promo2SinceYear']) * 52 + df_master0['Date_Mod'].dt.isocalendar().week.astype(int) - df_master0['Promo2SinceWeek'])
    df_master0['NoOfWeeksSincePromo2'] = df_master0['NoOfWeeksSincePromo2'].fillna(0)
    df_master0['NoOfWeeksSincePromo2'] = df_master0['NoOfWeeksSincePromo2'].apply(lambda value: 0 if value < 0 else value)
    del df_master0['Promo2SinceWeek']
    del df_master0['Promo2SinceYear']
    #Create 3 indicator columns for the three PromoInterval groups, then expand
    #them into 12 month columns (1 if Promo2 restarts in that month, 0 otherwise)
    for i in range(1,4):
        df_master0['PromoIntervalM{0}'.format(i)] = (df_master0['PromoInterval'] == i).astype('int')
    df_master0['PromoIntervalJan'] = df_master0['PromoIntervalM1']
    df_master0['PromoIntervalApr'] = df_master0['PromoIntervalM1']
    df_master0['PromoIntervalJul'] = df_master0['PromoIntervalM1']
    df_master0['PromoIntervalOct'] = df_master0['PromoIntervalM1']
    df_master0['PromoIntervalFeb'] = df_master0['PromoIntervalM2']
    df_master0['PromoIntervalMay'] = df_master0['PromoIntervalM2']
    df_master0['PromoIntervalAug'] = df_master0['PromoIntervalM2']
    df_master0['PromoIntervalNov'] = df_master0['PromoIntervalM2']
    df_master0['PromoIntervalMar'] = df_master0['PromoIntervalM3']
    df_master0['PromoIntervalJun'] = df_master0['PromoIntervalM3']
    df_master0['PromoIntervalSept'] = df_master0['PromoIntervalM3']
    df_master0['PromoIntervalDec'] = df_master0['PromoIntervalM3']
    del df_master0['PromoInterval']
    del df_master0['PromoIntervalM1']
    del df_master0['PromoIntervalM2']
    del df_master0['PromoIntervalM3']
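    #Each store participating in Promo2 should now flag exactly 4 months and
    #non-participants none; the value counts below should only show 0 and 4:
    month_cols = [c for c in df_master0.columns if c.startswith('PromoInterval')]
    print(df_master0[month_cols].sum(axis=1).value_counts())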
    del df_master0['Date_Mod'] #all date features are extracted, safe to drop now
    # df_master0['AvgSalesperStore'] = df_master0['Sales'].groupby(df_master0['Store'])
    df_master0['Open'] = df_master0['Open'].fillna(0) #test.csv contains some missing Open flags
    #Write df_master0 to CSV (note: this lands in the current working directory, not in `path`)
    df_master0.to_csv('master{0}.csv'.format(data), sep=',', index=False, encoding='utf-8')
    '''
    unique_stores = list()
    unique_stores = df_master0.CompetitionDistance.unique()
    s2 = df_master0['Sales'].groupby(df_master0['Date']).sum()
    '''
    '''
    #Create a dictionary of the variables and their descriptions
    master_dict = {
    "Id": "This represents an identifier for a store on a given particular date.",
    "Store": "a unique Id for each store. This will be a categorical variable. There are 1115 unique Ids, representing the same number of stores.",
    "Sales": "the turnover for any given day (Target Variable).",
    "Customers": "the number of customers on a given day",
    "Open": "an indicator for whether the store was open: Levels: 2 (0 = closed, 1 = open)",
    "StateHoliday": "indicates a state holiday: Levels: 4 (a = public holiday, b = Easter holiday, c = Christmas, 0 = None \n Variables after mapping: 0 = ('0' or 0) 1 = 'a' 2 = 'b' 3 = 'c' else 999)",
    "SchoolHoliday": "indicates if a Store on a given Date was affected by the closure of public schools: Levels: 2 (0 = Closed, 1 = Open)",
    "StoreType": "differentiates between 4 different store models: Levels: 4 (a, b, c, d) \n 1 = 'a' 2 = 'b' 3 = 'c' 4 = 'd' else 999",
    "Assortment": "describes an assortment level: Levels: 3 (a = basic, b = extra, c = extended) \n 1 = 'a' 2 = 'b' 3 = 'c' else 999",
    "CompetitionDistance": "distance in meters to the nearest competitor store",
    "CompetitionOpenSinceMonth": "gives the approximate month of the time the nearest competitor was opened",
    "CompetitionOpenSinceYear": "gives the year of the time the nearest competitor was opened",
    "Promo": "indicates whether a store is running a promo on that day",
    "Promo2": "Promo2 is a continuing and consecutive promotion for some stores: 0 = store is not participating, 1 = store is participating",
    "Promo2SinceWeek": "describes the calendar week when the store started participating in Promo2",
    "Promo2SinceYear": "describes the calendar year when the store started participating in Promo2",
    "PromoInterval": "describes the consecutive intervals when Promo2 is started, naming the months the promotion is started anew. E.g. 'Feb,May,Aug,Nov' means each round starts in February, May, August, November of any given year for that store \n 1 = 'Jan,Apr,Jul,Oct' 2 = 'Feb,May,Aug,Nov' 3 = 'Mar,Jun,Sept,Dec' else 999",
    "DayOfWeek": "= array([5, 4, 3, 2, 1, 7, 6], dtype=int64)",
    "Date": "the calendar date of the observation"
    }
    #Iterate through each column in the dataset and output high-level analysis results for each
    for number,each_column in enumerate(df_master0.columns.values):
        print ('-----------------------------------')
        print (' '+str(number+1)+'. '+each_column)
        #print ('About this variable: ' + master_dict[each_column])
        print ('Basic information about this variable:')
        print ('')
        print (df_master0[each_column].describe())
        print ('Number of uniques: '+str(len(df_master0[each_column].unique())))
        #Output a barchart of frequencies for each category in all the variables in the dataset
        i = df_master0[each_column].unique() #array of all uniques in the variable
        j = list()
        for each_unique in df_master0[each_column].unique():
            j = j + [len(df_master0[df_master0[each_column] == each_unique])] #list of counts corresponding to all uniques in the variable
        #Plot categories (uniques) vs. respective counts for each variable
        sns.axlabel("Categories","Count") #axlabel was removed in newer seaborn; plt.xlabel/plt.ylabel are the modern equivalents
        plt.title(each_column)
        sns.barplot(i,j)
        plt.show()
    #Plot pairwise plot for all variables
    #sns.pairplot(df_master[['income', 'firstDate', 'lastDate', 'amount', 'freqSales', 'saleSizeCode', 'starCustomer', 'lastSale', 'avrSale', 'class']],hue="class")
    #plt.show()
    '''
    #Correlation matrix among all variables in the data set (train only)
    if data == 'train':
        print (df_master0.corr()['Sales'])
    else:
        print (df_master0.info())
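    #For a ranked view of the linear relationships with the target, something
    #like this (train only) is a handy follow-up:
    if data == 'train':
        print(df_master0.corr()['Sales'].sort_values(ascending=False).head(10))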