Skip to content

Commit

Permalink
code update
Browse files Browse the repository at this point in the history
  • Loading branch information
Akash-Tandale001 committed Nov 8, 2022
1 parent faa49d5 commit 1b2da67
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 15 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
data.csv
data.csv
preprocessedR.csv
visualization.csv
__pycache__
61 changes: 61 additions & 0 deletions DataProcessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def processData():
    """Preprocess the raw consumption data and write two CSV files.

    Pipeline:
      1. Split off the FLAG / CONS_NO identifier columns.
      2. Drop duplicate rows and all-zero rows (from data and labels alike).
      3. Parse the column names as dates and sort them chronologically.
      4. Linearly interpolate short gaps of missing values (limit 2, both
         directions); remaining NaNs become 0.
      5. Cap per-row outliers at row mean + 3 * row std.
      6. Save 'visualization.csv' (unscaled) and 'preprocessedR.csv'
         (row-wise min-max scaled, with identifiers restored).

    Side effects: reads './data.csv'; writes 'visualization.csv' and
    'preprocessedR.csv' in the current working directory. Returns None.
    """
    rawData = pd.read_csv('./data.csv')

    # Split off the label/identifier columns so only the time series remain.
    infoData = pd.DataFrame()
    infoData['FLAG'] = rawData['FLAG']
    infoData['CONS_NO'] = rawData['CONS_NO']
    data = rawData.drop(['FLAG', 'CONS_NO'], axis=1)  # axis=1: drop columns

    # Drop duplicate rows (keeping the first occurrence) from both frames so
    # the labels stay aligned with their series.
    dropIndex = data[data.duplicated()].index
    data = data.drop(dropIndex, axis=0)
    infoData = infoData.drop(dropIndex, axis=0)

    # Drop rows whose readings sum to zero (all-zero / all-NaN rows).
    zeroIndex = data[data.sum(axis=1) == 0].index
    data = data.drop(zeroIndex, axis=0)
    infoData = infoData.drop(zeroIndex, axis=0)

    # Normalise column names to datetimes (e.g. '2014/1/1' -> 2014-01-01)
    # and sort chronologically — the raw file's columns are unsorted.
    data.columns = pd.to_datetime(data.columns)
    data = data.reindex(sorted(data.columns), axis=1)

    # Re-number the rows 0..n-1 in both frames after the drops above, so the
    # positional outlier/scaling steps and the final concat line up.
    data.reset_index(inplace=True, drop=True)
    infoData.reset_index(inplace=True, drop=True)

    # Fill short gaps by linear interpolation (at most 2 consecutive NaNs,
    # in either direction); anything still missing becomes 0.
    # NOTE(review): axis=0 interpolates down each column (across consumers),
    # not along each consumer's own time series — confirm this is intended.
    data = data.interpolate(method='linear', limit=2, limit_direction='both',
                            axis=0).fillna(0)

    # Outlier treatment, vectorised: cap each row's values at that row's
    # mean + 3 * std. Equivalent to the per-row mask loop, but one pandas
    # call instead of a Python loop over every row. (std uses ddof=1, the
    # same default as Series.std in the loop version.)
    rowMean = data.mean(axis=1)
    rowStd = data.std(axis=1)
    data = data.clip(upper=rowMean + 3 * rowStd, axis=0)

    # Save the preprocessed (but unscaled) data for visualisation.
    data.to_csv(r'visualization.csv', index=False, header=True)

    # Min-max normalise each row (consumer) independently: MinMaxScaler
    # works per column, so transpose before and after scaling.
    scale = MinMaxScaler()
    scaled = scale.fit_transform(data.values.T).T
    mData = pd.DataFrame(data=scaled, columns=data.columns)

    # Re-attach the identifier columns to restore the initial layout.
    preprData = pd.concat([infoData, mData], axis=1, sort=False)

    # Save the fully preprocessed, scaled data.
    preprData.to_csv(r'preprocessedR.csv', index=False, header=True)
23 changes: 11 additions & 12 deletions Train.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def ANN(X_train, X_test, y_train, y_test):

# model.fit(X_train, y_train, validation_split=0, epochs=i, shuffle=True, verbose=0)
model.fit(X_train, y_train, validation_split=0, epochs=epochs_number, shuffle=True, verbose=1)
prediction = model.predict_classes(X_test)
prediction = (model.predict(X_test) > 0.5).astype("int32")
model.summary()
results(y_test, prediction)

Expand All @@ -112,9 +112,8 @@ def CNN1D(X_train, X_test, y_train, y_test):
optimizer='adam',
metrics=['accuracy'])

# model.fit(X_train, y_train, epochs=1, validation_split=0.1, shuffle=False, verbose=1)
model.fit(X_train, y_train, epochs=epochs_number, validation_split=0, shuffle=False, verbose=1)
prediction = model.predict_classes(X_test)
prediction = (model.predict(X_test) > 0.5).astype("int32")
model.summary()
results(y_test, prediction)

Expand All @@ -133,15 +132,15 @@ def LR(X_train, X_test, y_train, y_test):
'''
model = LogisticRegression(C=1000, max_iter=1000, n_jobs=-1, solver='newton-cg')
model.fit(X_train, y_train)
prediction = model.predict(X_test)
prediction = (model.predict(X_test) > 0.5).astype("int32")
results(y_test, prediction)


def DT(X_train, X_test, y_train, y_test):
print('Decision Tree:')
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
prediction = model.predict(X_test)
prediction = (model.predict(X_test) > 0.5).astype("int32")
results(y_test, prediction)


Expand All @@ -159,14 +158,14 @@ def RF(X_train, X_test, y_train, y_test):
model = RandomForestClassifier(n_estimators=100, min_samples_leaf=1, max_features='auto', # max_depth=10,
random_state=0, n_jobs=-1)
model.fit(X_train, y_train)
prediction = model.predict(X_test)
prediction = (model.predict(X_test) > 0.5).astype("int32")
results(y_test, prediction)


def SVM(X_train, X_test, y_train, y_test):
model = SVC(random_state=0)
model.fit(X_train, y_train)
prediction = model.predict(X_test)
prediction = (model.predict(X_test) > 0.5).astype("int32")
results(y_test, prediction)


Expand All @@ -175,11 +174,11 @@ def processTraining():
X_train, X_test, y_train, y_test = read_data()

# Uncomment any model to test
ANN(X_train, X_test, y_train, y_test)
#ANN(X_train, X_test, y_train, y_test)
CNN1D(X_train, X_test, y_train, y_test)
RF(X_train, X_test, y_train, y_test)
LR(X_train, X_test, y_train, y_test)
DT(X_train, X_test, y_train, y_test)
SVM(X_train, X_test, y_train, y_test)
#RF(X_train, X_test, y_train, y_test)
#LR(X_train, X_test, y_train, y_test)
#DT(X_train, X_test, y_train, y_test)
#SVM(X_train, X_test, y_train, y_test)


4 changes: 2 additions & 2 deletions Visualisation2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ def visualisation2D():
for i in range(59, 83, 7):
axs4[0].plot(data.iloc[1,i:i + 7].to_numpy(), marker='>', linestyle='-',
label='$week {i}$'.format(i=(i % 58) % 6))
#xs4[0].legend(loc='best')
axs4[0].legend(loc='best')
axs4[0].set_title('With Fraud', fontsize=14)
axs4[0].set_ylabel('Consumption')
axs4[0].grid(True)

for i in range(59, 83, 7):
axs4[1].plot(data.iloc[6,i:i + 7].to_numpy(), marker='>', linestyle='-',
label='$week {i}$'.format(i=(i % 58) % 6))
#xs4[1].legend(loc='best')
axs4[1].legend(loc='best')
axs4[1].set_title('Without fraud' , fontsize=14)
axs4[1].set_ylabel('Consumption')
axs4[1].grid(True)
Expand Down

0 comments on commit 1b2da67

Please sign in to comment.