# -*- coding: utf-8 -*-
"""NewAssignment1ML.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1UKxDMN0OiKku9ESwq-dNnH9-f_CYYGIc
"""
from ucimlrepo import fetch_ucirepo
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
# Initialize the scaler and the label encoder
label_encoder = LabelEncoder()
scaler = StandardScaler()
# fetch dataset
abalone = fetch_ucirepo(id=1)
# data (as pandas dataframes); copy the features so adding a column later
# doesn't trigger pandas chained-assignment warnings
X = abalone.data.features.copy()
y = abalone.data.targets
# metadata
print(abalone.metadata)
# variable information
print(abalone.variables)
# Check the feature data we have
print(X)
# Check the target data we have
print(y)
# Plot all features to reveal any outliers
sns.boxplot(data=X)
plt.show()
# Check whether there are any null values in the features
print(X.isnull().sum())
# Drop duplicate feature rows, keeping y aligned by index
# (drop_duplicates is not in-place, so the result must be reassigned)
X = X.drop_duplicates()
y = y.loc[X.index]
# Look at the first few rows of the features
print(X.head())
# Encode 'Sex' as a numeric column so it can be used in modeling
X['SexNo'] = label_encoder.fit_transform(X['Sex'])
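# Optional sanity check: LabelEncoder assigns integer codes alphabetically,
# so for abalone this should map F -> 0, I -> 1, M -> 2
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))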
# Check which features we have and the dtype of each
print(X.dtypes)
# Drop any null/NaN rows in the features (dropna is not in-place, so reassign)
X = X.dropna()
# Check again for nulls in the features
print(X.isnull().sum())
# Drop any null/NaN rows in the target (again reassigning the result)
y = y.dropna()
# Check again for nulls in the target
print(y.isnull().sum())
# Drop the original string-valued 'Sex' column now that it is encoded
X = X.drop(['Sex'], axis=1)
#Check current info about our features
X.info()
# Compute the pairwise correlations between the features
correlation_matrix = X.corr().round(2)
# Plot the correlations as a heatmap
sns.heatmap(data=correlation_matrix, annot=True)
plt.show()
#Drop "SexNo" feature due to little correlation with the other data
X = X.drop(['SexNo'], axis=1)
#Generate colleration again with the SexNo feature removed
correlation_matrix = X.corr().round(2)
#Generate the visual correlization heatmap again without SexNo
sns.heatmap(data=correlation_matrix, annot=True)
# Standardize the features: each column becomes z = (x - mean) / std
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
print(X_scaled)
# Double-check the structure of X_scaled
X_scaled.info()
# Convert the target to a NumPy array
y = y.to_numpy()
print(y)
# Check the original X's summary statistics
print(X.describe())
# Compare with X_scaled's statistics: means should now be ~0 and stds ~1
print(X_scaled.describe())
# Flatten y to a 1-D array (sklearn regressors expect shape (n_samples,))
y = y.ravel()
#Split our preprocessed data to training and testing data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=50)
#Double check our training and testing data
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
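# Assuming no duplicate or null rows were dropped above, abalone's 4177 rows
# should split into about 3341 training and 836 test samples
# (scikit-learn rounds the test split up)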
# Set our hyper-parameters as variables so they are easy to change
etaVar = 0.01
maxIterVar = 100000
tolVar = 0.0001
learningRateVar = "invscaling"
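# Note: with learning_rate="invscaling", scikit-learn decays the step size as
# eta = eta0 / t**power_t (power_t defaults to 0.25), so updates shrink over time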
# Fit a linear regression trained by stochastic gradient descent
model = SGDRegressor(eta0=etaVar, max_iter=maxIterVar, tol=tolVar, learning_rate=learningRateVar, loss='squared_error', penalty='l2')
model.fit(X_train, y_train)
# Inspect the learned weights of the linear equation
print(model.coef_)
# Inspect the fitted y-intercept
print(model.intercept_)
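# Optional sketch: assemble the fitted linear equation from the weights above;
# this assumes X_scaled kept the original abalone column order
terms = " + ".join(f"{w:.3f}*{name}" for w, name in zip(model.coef_, X.columns))
print(f"Rings ~= {model.intercept_[0]:.3f} + {terms}")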
#Calculate our MSE, MAE, EV, and R2
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
ev = explained_variance_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Print those evaluation metrics with labels
print("MSE:", mse)
print("MAE:", mae)
print("Explained variance:", ev)
print("R^2:", r2)
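# Optional: RMSE = sqrt(MSE) expresses the error in the target's own units
# (number of rings), which is easier to interpret than squared error
print("RMSE:", np.sqrt(mse))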
# Get a full statistical summary chart via statsmodels OLS; add a constant
# column so the OLS fit includes an intercept, matching the SGD model above
mod = sm.OLS(y_train, sm.add_constant(X_train))
res = mod.fit()
print(res.summary())
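# Optional sketch: LinearRegression is imported above but never used; fitting
# the closed-form least-squares model on the same split gives a baseline to
# sanity-check the SGD results against
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
print("LinearRegression test R^2:", lin_model.score(X_test, y_test))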