-
Notifications
You must be signed in to change notification settings - Fork 0
/
Regression.py
139 lines (125 loc) · 4.3 KB
/
Regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# -*- coding: utf-8 -*-
import numpy as np
import scipy.spatial as sc
import pandas as pd
import time
# Seed from OS entropy (no fixed seed): every run shuffles the CV folds differently,
# so reported errors/timings are not reproducible run-to-run.
np.random.seed()
def Fold(length, fold):
    """Return `fold` random (train, test) index-array pairs over range(length).

    Indices are shuffled once and partitioned with np.array_split, which
    spreads any remainder over the first folds.  The original fixed-step
    slicing (`step = length // fold`) silently dropped the last
    `length - fold*step` shuffled indices from every test set when `length`
    was not divisible by `fold`; with array_split every sample appears in
    exactly one test split.

    Parameters:
        length: total number of samples to index.
        fold:   number of cross-validation folds.

    Returns:
        list of (train_indices, test_indices) ndarray tuples, one per fold.
    """
    inds = np.arange(length)
    np.random.shuffle(inds)
    chunks = np.array_split(inds, fold)
    result = []
    for i in range(fold):
        test = chunks[i]
        # Train set = all chunks except the held-out one.
        train = np.concatenate([chunks[j] for j in range(fold) if j != i])
        result.append((train, test))
    return result
def MSE(y, y_true):
    """Mean squared error between predictions `y` and targets `y_true`."""
    residuals = y - y_true
    return np.sum(residuals ** 2) / len(y)
class Regressor:
    """Regression with three interchangeable models.

    model='lin'    : ordinary least squares on [X | 1] via the normal equations.
    model='NW'     : Nadaraya-Watson kernel-weighted average of training targets.
    model='loclin' : locally-linear regression (weighted least squares per query).

    Features are normalised by their per-column training maximum in fit() and
    the same scale is reapplied in predict().  NOTE(review): a constant-zero
    feature column would make that a divide-by-zero — assumed not to occur.
    """

    scale = None    # per-feature max of the training X (normalisation divisor)
    model = None    # 'lin' | 'NW' | 'loclin'
    theta = None    # OLS coefficients, last entry is the intercept ('lin' only)
    h = 0           # kernel bandwidth, set by fit_NW
    kernel = None   # scalar kernel function of the scaled distance
    fold = 0        # stored fold count (not used inside the class itself)
    X_tr = None     # normalised training features (kernel models)
    y_tr = None     # training targets (kernel models)

    def __init__(self, model='lin', kernel=None, fold=5):
        self.model = model
        self.fold = fold
        # All kernels take d = ||x - x_i|| / h >= 0 and return a scalar weight.
        if kernel == 'boxcar':
            self.kernel = lambda x: 1.0 if 0 <= x <= 1 else 0.0
        elif kernel == 'gauss':
            self.kernel = lambda x: np.sqrt(2 / np.pi) * np.exp(-(x ** 2) / 2)
        elif kernel == 'epanech':
            self.kernel = lambda x: 3 / 2 * (1 - x ** 2) if 0 <= x <= 1 else 0.0

    def fit(self, X, y):
        """Fit the selected model; 1-D X is treated as single-feature rows."""
        if len(X.shape) == 1:
            X = np.array([[x] for x in X])
        self.scale = np.max(X, axis=0)
        X = X / self.scale
        if self.model == 'lin':
            self.fit_lin(X, y)
        elif self.model == 'NW' or self.model == 'loclin':
            self.fit_NW(X, y)

    def predict(self, X, h=None):
        """Predict targets for X; `h` optionally overrides the fitted bandwidth."""
        if len(X.shape) == 1:
            X = np.array([[x] for x in X])
        if h is None:
            h = self.h
        X = X / self.scale  # same normalisation as in fit()

        if self.model == 'lin':
            return np.dot(np.hstack((X, np.ones((X.shape[0], 1)))), self.theta)

        if self.model == 'NW':
            preds = np.empty(X.shape[0])
            for i, x in enumerate(X):
                w = np.array([self.kernel(sc.distance.euclidean(x, xt) / h)
                              for xt in self.X_tr])
                total = np.sum(w)
                # No training point inside the kernel support -> 0/0.  Return 0
                # as the original intended; its mask `invert(isnan(p)) * p` still
                # produced NaN because 0 * nan == nan in NumPy.  Weights are also
                # computed once per query here instead of twice.
                preds[i] = np.dot(w, self.y_tr) / total if total > 0 else 0.0
            return preds

        if self.model == 'loclin':
            X_ = np.hstack((self.X_tr, np.ones((self.X_tr.shape[0], 1))))
            preds = np.empty(X.shape[0])
            for i, x in enumerate(X):
                w = np.array([self.kernel(sc.distance.euclidean(x, xt) / h)
                              for xt in self.X_tr])
                # Weighted normal equations: (X'WX) theta = X'Wy, solved per query.
                theta = np.linalg.lstsq(np.dot(X_.T * w, X_),
                                        np.dot(X_.T * w, self.y_tr),
                                        rcond=None)[0]
                preds[i] = np.dot(np.hstack((x, np.ones(1))), theta)
            return preds

    def fit_lin(self, X, y):
        """Solve the OLS normal equations on X augmented with a bias column."""
        X_ = np.hstack((X, np.ones((X.shape[0], 1))))
        self.theta = np.linalg.lstsq(np.dot(X_.T, X_), np.dot(X_.T, y),
                                     rcond=None)[0]

    def fit_NW(self, X, y):
        """Memorise the training set and pick a rule-of-thumb bandwidth."""
        # Bandwidth shrinks with sample count, grows with dimensionality.
        self.h = (10 / X.shape[0]) ** (1 / X.shape[1])
        self.X_tr = X
        self.y_tr = y
def _cv_score(reg, X, y, n_folds):
    """Run n_folds-fold cross-validation for `reg` on (X, y).

    Returns (mean MSE, mean fit time, mean predict time) averaged over folds.
    Extracted so the identical loop is not written twice (once for the linear
    baseline, once per model/kernel pair, as in the original script).
    """
    error = 0.0
    time_f = 0.0
    time_p = 0.0
    for train, test in Fold(X.shape[0], n_folds):
        t1 = time.time()
        reg.fit(X[train], y[train])
        t2 = time.time()
        y_ = reg.predict(X[test])
        t3 = time.time()
        error += MSE(y_, y[test])
        time_f += t2 - t1
        time_p += t3 - t2
    return error / n_folds, time_f / n_folds, time_p / n_folds


data = pd.read_csv('winequality-red.csv')
X = np.array(data.drop(columns=['quality']))
y = np.array(data['quality'], dtype=float)

models = ['NW', 'loclin']
kernels = ['boxcar', 'gauss', 'epanech']
errors = []
times_fit = []
times_predict = []
N = 5

# Baseline: plain linear regression.
error, time_f, time_p = _cv_score(Regressor(model='lin'), X, y, N)
errors.append(error)
times_fit.append(time_f)
times_predict.append(time_p)

# Every kernel-regression model/kernel combination.
for model in models:
    for kernel in kernels:
        error, time_f, time_p = _cv_score(
            Regressor(model=model, kernel=kernel), X, y, N)
        errors.append(error)
        times_fit.append(time_f)
        times_predict.append(time_p)

print(errors, times_fit, times_predict)