-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsat.py
153 lines (112 loc) · 4.45 KB
/
sat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 25 17:49:37 2018
@author: Shantam Vijayputra and Zameer Ul Haque
"""
#importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LinearRegression
import pickle
from mlxtend.plotting import plot_learning_curves
from mlxtend.preprocessing import shuffle_arrays_unison
from sklearn import tree
import math
#import math
#creating class model
class sat(object):
    """Small pipeline: load a two-column CSV, fit a decision tree, plot and persist it.

    Every step is exposed as a classmethod so it can be called either on the
    class or on an instance. The steps rely on names imported at module level
    (pandas, matplotlib, scikit-learn, mlxtend, pickle).
    """

    def __init__(self, df=None):
        """Store an optional data frame; none of the pipeline steps read it."""
        self.df = df

    @classmethod
    def load_data(cls, path):
        """Read the CSV at *path* and return its first two columns.

        Args:
            path: Location of the CSV file, passed straight to ``pd.read_csv``.

        Returns:
            Tuple ``(x, y)`` of column vectors with shape ``(n_rows, 1)``:
            ``x`` is the first CSV column, ``y`` the second.
        """
        df = pd.read_csv(path)
        x = df.iloc[:, 0].values.reshape(-1, 1)
        y = df.iloc[:, 1].values.reshape(-1, 1)
        return x, y

    @classmethod
    def split(cls, x, y):
        """Split *x* and *y* into train/test parts (20 % test, fixed seed 0).

        Returns:
            ``x_train, x_test, y_train, y_test`` in that order.
        """
        return train_test_split(x, y, test_size=0.2, random_state=0)

    @classmethod
    def scale(cls, x_train, x_test, y_train, y_test):
        """Standardise features and targets using training statistics only.

        Scaling may or may not be needed depending on the data set. The
        scalers are fitted on the training arrays and merely applied to the
        test arrays, so no test-set statistics leak into preprocessing and
        train/test values share one scale. Features and targets get separate
        scalers because their statistics are unrelated.

        Returns:
            ``x_train, x_test, y_train, y_test`` after scaling.
        """
        x_scaler = StandardScaler()
        x_train = x_scaler.fit_transform(x_train)
        x_test = x_scaler.transform(x_test)

        y_scaler = StandardScaler()
        y_train = y_scaler.fit_transform(y_train)
        y_test = y_scaler.transform(y_test)
        return x_train, x_test, y_train, y_test

    @classmethod
    def missing_val(cls, x_train):
        """Replace NaN entries of a 2-D array with the per-column mean.

        The array is modified in place (and also returned), so it keeps its
        original dtype.

        Args:
            x_train: 2-D array to impute; despite the name it is used for
                every split in ``main``.

        Returns:
            The same array object with missing values filled in.
        """
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imputer = imputer.fit(x_train)
        x_train[:, :] = imputer.transform(x_train)
        return x_train

    @classmethod
    def classifier(cls):
        """Return a fresh, unfitted ``DecisionTreeClassifier``."""
        return tree.DecisionTreeClassifier()

    @classmethod
    def plot(cls, clf, x, y, color1, color2, title, xlable, ylable):
        """Scatter the observed values and the model predictions over *x*.

        Args:
            clf: Fitted estimator exposing ``predict``.
            x: Feature values for the horizontal axis (also fed to ``clf``).
            y: Observed target values.
            color1: Colour of the observed points.
            color2: Colour of the predicted points.
            title: Figure title.
            xlable: Label of the x axis.
            ylable: Label of the y axis.

        Returns:
            Whatever ``plt.show()`` returns.
        """
        # Each series needs a label, otherwise plt.legend() has nothing to show.
        plt.scatter(x, y, color=color1, label="Actual")
        plt.scatter(x, clf.predict(x), color=color2, label="Predicted")
        plt.title(title)
        plt.xlabel(xlable)
        plt.ylabel(ylable)
        plt.legend()
        return plt.show()

    @classmethod
    def save_classifier(cls, clf, name_with_path, _type):
        """Pickle *clf* to the file *name_with_path*.

        Args:
            clf: Object to serialise.
            name_with_path: Destination file path.
            _type: File mode handed to ``open`` (``"wb"`` for pickle).

        Returns:
            ``None`` (the result of ``pickle.dump``).
        """
        # The context manager guarantees the handle is flushed and closed.
        with open(name_with_path, _type) as handle:
            return pickle.dump(clf, handle)

    @classmethod
    def load_classifier(cls, name_with_path, _type):
        """Unpickle and return the object stored in *name_with_path*.

        Args:
            name_with_path: Source file path.
            _type: File mode handed to ``open`` (``"rb"`` for pickle).
        """
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(name_with_path, _type) as handle:
            return pickle.load(handle)

    @classmethod
    def main(cls):
        """Run the whole pipeline on ``gpa.csv`` in the working directory.

        Prints the module docstring, loads and splits the data, imputes
        missing values, fits the classifier, plots train and test results,
        round-trips the model through ``sat_score.pkl`` and finally draws
        learning curves.
        """
        print(__doc__)
        obj = cls()

        # Features and targets, then the train/test partition.
        x, y = obj.load_data("gpa.csv")
        x_train, x_test, y_train, y_test = obj.split(x, y)

        # Feature scaling is deliberately not applied; see scale() if needed.

        # Impute missing values in every split.
        x_train = obj.missing_val(x_train)
        x_test = obj.missing_val(x_test)
        y_train = obj.missing_val(y_train)
        y_test = obj.missing_val(y_test)

        clf = obj.classifier()
        clf.fit(x_train, y_train)

        obj.plot(clf, x_train, y_train, "orange", "blue",
                 "sat score (Training set)", "GPA", "SAT SCORE")
        obj.plot(clf, x_test, y_test, "orange", "blue",
                 "sat score (Testing set)", "GPA", "SAT SCORE")

        # Persist the model and read it back to confirm the round trip works.
        obj.save_classifier(clf, "sat_score.pkl", "wb")
        clf = obj.load_classifier("sat_score.pkl", "rb")

        # Evaluate on the held-out split only: scoring against the full data
        # set would count the training rows as test data.
        plot_learning_curves(x_train, y_train, x_test, y_test, clf)
        plt.show()
if __name__ == "__main__":
    # Script entry point: build an instance and run the full pipeline.
    sat().main()