# -*- coding: utf-8 -*-
"""NewAssignment1ML.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1UKxDMN0OiKku9ESwq-dNnH9-f_CYYGIc
"""
from ucimlrepo import fetch_ucirepo
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
# Initialize the scaler and the label encoder
label_encoder = LabelEncoder()
scaler = StandardScaler()
# fetch dataset
abalone = fetch_ucirepo(id=1)
# data (as pandas dataframes); copy the features so adding a column later
# doesn't trigger pandas chained-assignment warnings
X = abalone.data.features.copy()
y = abalone.data.targets
# metadata
print(abalone.metadata)
# variable information
print(abalone.variables)
# Check the feature data we have
print(X)
# Check the target data we have
print(y)
# Plot all features to reveal any outliers
sns.boxplot(data=X)
plt.show()
# Check whether there are any null values in the features
print(X.isnull().sum())
# Drop duplicate feature rows, keeping y aligned by index
# (drop_duplicates is not in-place, so the result must be reassigned)
X = X.drop_duplicates()
y = y.loc[X.index]
# Look at the first few rows of the features
print(X.head())
# Encode 'Sex' as a numeric column so it can be used in modeling
X['SexNo'] = label_encoder.fit_transform(X['Sex'])
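# Optional sanity check: LabelEncoder assigns integer codes alphabetically,
# so for abalone this should map F -> 0, I -> 1, M -> 2
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))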
# Check which features we have and the dtype of each
print(X.dtypes)
# Drop any null/NaN rows in the features (dropna is not in-place, so reassign)
X = X.dropna()
# Check again for nulls in the features
print(X.isnull().sum())
# Drop any null/NaN rows in the target (again reassigning the result)
y = y.dropna()
# Check again for nulls in the target
print(y.isnull().sum())
# Drop the original string-valued 'Sex' column now that it is encoded
X = X.drop(['Sex'], axis=1)
#Check current info about our features
X.info()
# Compute the pairwise correlations between the features
correlation_matrix = X.corr().round(2)
# Plot the correlations as a heatmap
sns.heatmap(data=correlation_matrix, annot=True)
plt.show()
#Drop "SexNo" feature due to little correlation with the other data
X = X.drop(['SexNo'], axis=1)
#Generate colleration again with the SexNo feature removed
correlation_matrix = X.corr().round(2)
#Generate the visual correlization heatmap again without SexNo
sns.heatmap(data=correlation_matrix, annot=True)
# Standardize the features: each column becomes z = (x - mean) / std
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
print(X_scaled)
# Double-check the structure of X_scaled
X_scaled.info()
# Convert the target to a NumPy array
y = y.to_numpy()
print(y)
# Check the original X's summary statistics
print(X.describe())
# Compare with X_scaled's statistics: means should now be ~0 and stds ~1
print(X_scaled.describe())
# Flatten y to a 1-D array (sklearn regressors expect shape (n_samples,))
y = y.ravel()
#Split our preprocessed data to training and testing data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=50)
#Double check our training and testing data
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
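# Assuming no duplicate or null rows were dropped above, abalone's 4177 rows
# should split into about 3341 training and 836 test samples
# (scikit-learn rounds the test split up)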
# Set our hyper-parameters as variables so they are easy to change
etaVar = 0.01
maxIterVar = 100000
tolVar = 0.0001
learningRateVar = "invscaling"
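# Note: with learning_rate="invscaling", scikit-learn decays the step size as
# eta = eta0 / t**power_t (power_t defaults to 0.25), so updates shrink over time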
# Fit a linear regression trained by stochastic gradient descent
model = SGDRegressor(eta0=etaVar, max_iter=maxIterVar, tol=tolVar, learning_rate=learningRateVar, loss='squared_error', penalty='l2')
model.fit(X_train, y_train)
# Inspect the learned weights of the linear equation
print(model.coef_)
# Inspect the fitted y-intercept
print(model.intercept_)
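# Optional sketch: assemble the fitted linear equation from the weights above;
# this assumes X_scaled kept the original abalone column order
terms = " + ".join(f"{w:.3f}*{name}" for w, name in zip(model.coef_, X.columns))
print(f"Rings ~= {model.intercept_[0]:.3f} + {terms}")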
#Calculate our MSE, MAE, EV, and R2
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
ev = explained_variance_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Print those evaluation metrics with labels
print("MSE:", mse)
print("MAE:", mae)
print("Explained variance:", ev)
print("R^2:", r2)
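# Optional: RMSE = sqrt(MSE) expresses the error in the target's own units
# (number of rings), which is easier to interpret than squared error
print("RMSE:", np.sqrt(mse))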
# Get a full statistical summary chart via statsmodels OLS; add a constant
# column so the OLS fit includes an intercept, matching the SGD model above
mod = sm.OLS(y_train, sm.add_constant(X_train))
res = mod.fit()
print(res.summary())
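# Optional sketch: LinearRegression is imported above but never used; fitting
# the closed-form least-squares model on the same split gives a baseline to
# sanity-check the SGD results against
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
print("LinearRegression test R^2:", lin_model.score(X_test, y_test))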