-
Notifications
You must be signed in to change notification settings - Fork 0
/
knnClass.py
144 lines (104 loc) · 4.24 KB
/
knnClass.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 9 17:23:53 2018
@author: sinan
"""
import random
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss
def distance(p1, p2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    p1, p2 : array-like of numbers
        Coordinates of the two points; both must have the same shape
        (or be broadcastable against each other).

    Returns
    -------
    numpy.float64
        Square root of the summed squared coordinate differences.
    """
    # Converting to float arrays lets callers pass plain lists/tuples and
    # keeps unsigned-integer inputs from wrapping around in the subtraction.
    start = np.asarray(p1, dtype=float)
    end = np.asarray(p2, dtype=float)
    return np.sqrt(np.sum(np.power(end - start, 2)))
# Quick sanity check of distance(): print the distance between (1, 1) and (2, 3).
p1 = np.array([1,1])
p2 = np.array([2,3])
print(distance(p1, p2))
def majority_vote(votes):
    """Return the most frequent element of ``votes``.

    When several elements share the highest count, one of them is picked
    with ``random.choice``, so repeated calls may return different winners.

    Parameters
    ----------
    votes : iterable of hashable
        The individual votes.

    Returns
    -------
    object
        One of the elements of ``votes`` that occurs most often.

    Raises
    ------
    ValueError
        If ``votes`` contains no elements.
    """
    vote_counts = Counter(votes)
    if not vote_counts:
        raise ValueError("majority_vote() requires at least one vote")

    max_count = max(vote_counts.values())
    # Counter keeps first-seen order, so the candidate list (and therefore
    # the result for a given random state) does not depend on hashing.
    winners = [vote for vote, count in vote_counts.items() if count == max_count]
    return random.choice(winners)
# Try majority_vote() on a hand-written ballot. 1 and 3 both occur five
# times here, so `winner` is chosen at random between those two values.
votes = [1,1,1,2,2,3,3,1,1,3,3,3,4,5,6]
winner = majority_vote(votes)
# Toy data: a 3x3 lattice of points plus a single query point p.
points = np.array([[1,1],[1,2],[1,3],[2,1],[2,2],[2,3],[3,1],[3,2],[3,3]])
p = np.array([2.5,2])
# Lattice points as blue circles, the query point as a red star.
plt.plot(points[:,0], points[:,1], 'bo')
plt.plot(p[0], p[1], 'r*')
plt.show()
def find_nearest_neighbors(p, points, k=2):
    """Return the indices of the ``k`` points closest to ``p``.

    Parameters
    ----------
    p : array-like
        Coordinates of the query point.
    points : array-like
        Candidate points, one per entry along the first axis.
    k : int, optional
        Number of neighbours to return (default 2). If ``k`` exceeds the
        number of points, all indices are returned.

    Returns
    -------
    numpy.ndarray
        Indices into ``points`` ordered by increasing Euclidean distance
        from ``p``; equidistant points keep their original index order.
    """
    candidates = np.asarray(points, dtype=float)
    query = np.asarray(p, dtype=float)

    # One vectorised pass over every candidate instead of a Python loop.
    # Summing over all axes but the first yields one distance per point.
    squared = np.power(candidates - query, 2)
    distances = np.sqrt(np.sum(squared, axis=tuple(range(1, squared.ndim))))

    # A stable sort makes the result deterministic when distances tie.
    ind = np.argsort(distances, kind="stable")
    return ind[:k]
def knn_predict(p, points, outcomes, k=5):
    """Predict the class of ``p`` from its ``k`` nearest neighbours.

    The neighbours are located with ``find_nearest_neighbors`` and their
    labels are passed to ``majority_vote``, which breaks ties at random.

    Parameters
    ----------
    p : array-like
        Coordinates of the point to classify.
    points : numpy.ndarray
        Training points, one per row.
    outcomes : array-like
        Class label of each row of ``points``.
    k : int, optional
        Number of neighbours that vote (default 5).

    Returns
    -------
    object
        The winning label, as returned by ``majority_vote``.
    """
    ind = find_nearest_neighbors(p, points, k)
    # A plain list cannot be indexed with an index array, so normalise the
    # labels to an ndarray before selecting the neighbours' labels.
    neighbor_labels = np.asarray(outcomes)[ind]
    return majority_vote(neighbor_labels)
# Class labels for a nine-element data set (presumably the 3x3 lattice in
# `points` defined earlier — confirm).
# NOTE(review): this binding is replaced by the generate_synthetic_data(n)
# call further down before any statement reads it.
outcomes = np.array([0,0,0,0,1,1,1,1,1])
def generate_synthetic_data(n=50):
    """Draw two labelled clouds of ``n`` two-dimensional points each.

    Every coordinate of the first cloud is sampled from a normal
    distribution with mean 0 and standard deviation 1; every coordinate of
    the second cloud from a normal distribution with mean 1 and standard
    deviation 1.

    Returns a tuple ``(points, outcomes)``: ``points`` stacks the first
    cloud on top of the second (``2 * n`` rows, 2 columns) and ``outcomes``
    holds ``n`` zeros followed by ``n`` ones.
    """
    first_cloud = ss.norm(0, 1).rvs((n, 2))
    second_cloud = ss.norm(1, 1).rvs((n, 2))
    points = np.vstack((first_cloud, second_cloud))
    outcomes = np.repeat([0, 1], n)
    return points, outcomes
# Draw 20 points per class and plot them: class 0 in blue, class 1 in red.
n = 20
(points, outcomes) = generate_synthetic_data(n)
plt.figure()
# generate_synthetic_data puts the n class-0 rows first and the n class-1
# rows after them, hence the [:n] / [n:] split.
plt.plot(points[:n,0], points[:n,1], "bo")
plt.plot(points[n:,0], points[n:,1], "ro")
plt.show()
def make_prediction_grid(predictors, outcomes, limits, h, k):
    """Classify every node of a regular grid with ``knn_predict``.

    ``limits`` is ``(x_min, x_max, y_min, y_max)`` and ``h`` is the grid
    spacing; nodes are generated with ``np.arange``, so the upper bounds
    are excluded. Each node is classified from ``predictors``/``outcomes``
    using ``k`` neighbours.

    Returns ``(xx, yy, prediction_grid)``: the two coordinate matrices
    produced by ``np.meshgrid`` and an integer array of the same shape
    holding the predicted class of each node.
    """
    x_min, x_max, y_min, y_max = limits
    x_values = np.arange(x_min, x_max, h)
    y_values = np.arange(y_min, y_max, h)
    xx, yy = np.meshgrid(x_values, y_values)

    prediction_grid = np.zeros(xx.shape, dtype=int)
    # The grid is indexed [y index, x index], while the loops walk x in the
    # outer loop and y in the inner loop.
    for col, x in enumerate(x_values):
        for row, y in enumerate(y_values):
            node = np.array([x, y])
            prediction_grid[row, col] = knn_predict(node, predictors, outcomes, k)
    return xx, yy, prediction_grid
def plot_prediction_grid(xx, yy, prediction_grid, filename,
                         predictors=None, outcomes=None):
    """Plot KNN predictions for every point on the grid and save the figure.

    Parameters
    ----------
    xx, yy : numpy.ndarray
        Coordinate matrices of the grid nodes.
    prediction_grid : numpy.ndarray
        Predicted class for each grid node; drawn as the coloured background.
    filename : str
        Path the figure is written to with ``plt.savefig``.
    predictors : numpy.ndarray, optional
        Observed points (two columns) drawn on top of the background.
    outcomes : array-like, optional
        Class of each observed point, used to colour the scatter markers.

    Raises
    ------
    NameError
        If ``predictors`` or ``outcomes`` is neither passed in nor defined
        at module level.
    """
    from matplotlib.colors import ListedColormap

    # Earlier versions read the observations from module-level variables;
    # keep that as the fallback so existing four-argument calls still work.
    module_scope = globals()
    if predictors is None:
        predictors = module_scope.get("predictors")
    if outcomes is None:
        outcomes = module_scope.get("outcomes")
    if predictors is None or outcomes is None:
        raise NameError(
            "predictors and outcomes must be passed in or defined at module level"
        )

    background_colormap = ListedColormap(["hotpink", "lightskyblue", "yellowgreen"])
    observation_colormap = ListedColormap(["red", "blue", "green"])

    plt.figure(figsize=(7, 7))
    plt.pcolormesh(xx, yy, prediction_grid, cmap=background_colormap, alpha=0.5)
    plt.scatter(predictors[:, 0], predictors[:, 1], c=outcomes,
                cmap=observation_colormap, s=50)
    plt.xlabel('Variable 1')
    plt.ylabel('Variable 2')
    # Hide the tick marks on both axes.
    plt.xticks(())
    plt.yticks(())
    plt.xlim(np.min(xx), np.max(xx))
    plt.ylim(np.min(yy), np.max(yy))
    plt.savefig(filename)
# Build and save the k-NN decision map for a fresh synthetic data set
# (generate_synthetic_data's default of 50 points per class).
predictors, outcomes = generate_synthetic_data()
# NOTE(review): the file name says "5" but k is 25 — confirm which is intended.
k = 25; filename = 'knn_predict_5.pdf'; limits = (-3,4,-3,4); h = 0.1;
(xx, yy, prediction_grid) = make_prediction_grid(predictors, outcomes, limits, h, k)
# The scatter layer of the plot uses the module-level predictors/outcomes
# bound above.
plot_prediction_grid(xx, yy, prediction_grid, filename)
# Repeat the exercise on the iris data set, keeping only its first two
# feature columns so the data can be drawn in 2-D.
from sklearn import datasets
iris = datasets.load_iris()
predictors = iris.data[:,0:2]
outcomes = iris.target
# One colour per target class: 0 red, 1 blue, 2 green.
# NOTE(review): no plt.figure() call precedes these plots, so they
# presumably land on the figure left open by the previous
# plot_prediction_grid call — confirm this is intended.
plt.plot(predictors[outcomes==0][:,0], predictors[outcomes==0][:,1], "ro")
plt.plot(predictors[outcomes==1][:,0], predictors[outcomes==1][:,1], "bo")
plt.plot(predictors[outcomes==2][:,0], predictors[outcomes==2][:,1], "go")
plt.show()
# Decision map for the iris data with k = 5, saved to iris_grid.pdf.
k = 5; filename = 'iris_grid.pdf'; limits = (4,8,1.5,4.5); h = 0.1;
(xx, yy, prediction_grid) = make_prediction_grid(predictors, outcomes, limits, h, k)
plot_prediction_grid(xx, yy, prediction_grid, filename)
# Compare scikit-learn's 5-nearest-neighbour classifier with the
# hand-written knn_predict on the same predictors/outcomes.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(predictors, outcomes)
sk_predictions = knn.predict(predictors)
# knn_predict breaks voting ties at random (via majority_vote), so this
# array can differ slightly from run to run.
my_predictions = np.array([knn_predict(p, predictors, outcomes, k=5) for p in predictors])
# Fraction of samples each classifier labels correctly; both are scored on
# the same data they were given as training points.
print(np.mean(sk_predictions==outcomes))
print(np.mean(my_predictions==outcomes))