KMeans.py
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns


class KMeans(object):
    def __init__(self, iterations, tol):
        # maximum number of iterations and relative-change tolerance for stopping
        self.iterations = iterations
        self.tol = tol
        self.clustering = {}
"""
Initializing centers randomly
"""
def initial_centers(self, data, k):
centers = np.zeros((k, data.shape[1]))
while True:
# get random int numbers with range of size of data
rand_num = [random.randint(0, data.shape[0]-1) for i in range(k)]
# check if the number of the random centers are equal to k
if len(set(rand_num)) == k:
for i in range(k):
centers[i] = data[rand_num[i]]
break
return centers
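
    # Note (sketch of an alternative, not used here): random.sample(range(data.shape[0]), k)
    # would draw k distinct indices directly, avoiding the rejection loop above.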
"""
The implementation of K means algorithm
"""
def algorithm(self, data, k):
centers = self.initial_centers(data, k)
for i in range(self.iterations):
self.clustering = {}
# Create a k dictionary of list which will contain
# the data for every cluster
for j in range(k):
self.clustering[j] = []
for feature in data:
# get the k Euclidean distances for each center
distances = [np.linalg.norm(feature - c)**2 for c in centers]
# get the min distance from center
min_dist = distances.index(min(distances))
# adding in nearest cluster
self.clustering[min_dist].append(feature)
previous_centers = np.array(centers)
for clf in self.clustering:
"""
update centers according with mean of features
for each cluster
"""
centers[clf] = np.average(self.clustering[clf], axis=0)
if self.stop_iteration(previous_centers, centers):
# print(i)
for j in range(k):
self.clustering[j] = np.array(self.clustering[j])
break
return centers, self.clustering
"""
Stop K means algorithm if after 1 iteration the percent change
of new centers are smaller than a tolerance
"""
def stop_iteration(self, prev, cur):
for i in range(cur.shape[0]):
if np.sum((cur[i] - prev[i]) / prev[i]) > self.tol:
return False
return True
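
# Worked example of the stopping criterion (illustrative values only, not from a run):
# with prev = [2.0, 2.0] and cur = [2.001, 1.999] the summed relative change is
# |0.001|/2.0 + |0.001|/2.0 = 0.001, so tol = 0.0001 keeps iterating while
# tol = 0.01 would stop.
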
if __name__ == '__main__':
    arr = np.array([[1, 1],
                    [2, 3],
                    [10, 4],
                    [15, 16],
                    [7, 1],
                    [6, 3],
                    [4, 7],
                    [20, 10],
                    [15.2, 4],
                    [3, 3],
                    [4.5, 20],
                    [0, 0],
                    [20, 20],
                    [19, 18],
                    [3, 17],
                    [12, 15],
                    [1, 16]])
    kmeans = KMeans(100, 0.0001)
    centers, clusters = kmeans.algorithm(arr, 3)
    print(centers)
    print(clusters)

    sns.set_theme()
    plt.figure()
    plt.title('3 Means Clustering')
    plt.ylabel('y-axis')
    plt.xlabel('x-axis')
    # plot each cluster in its own colour and mark its center with an 'x'
    plt.plot(clusters[0][:, 0], clusters[0][:, 1], 'o', label='class 1', color='blue')
    plt.plot(centers[0, 0], centers[0, 1], 'x', color='blue')
    plt.plot(clusters[1][:, 0], clusters[1][:, 1], 'o', label='class 2', color='red')
    plt.plot(centers[1, 0], centers[1, 1], 'x', color='red')
    plt.plot(clusters[2][:, 0], clusters[2][:, 1], 'o', label='class 3', color='black')
    plt.plot(centers[2, 0], centers[2, 1], 'x', color='black')
    plt.legend()
    plt.show()
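
    # Optional sanity check, a minimal sketch assuming scikit-learn is installed:
    # fit scikit-learn's reference KMeans on the same toy data and print its centers.
    # Cluster numbering will generally differ, so only the sets of center
    # coordinates are expected to roughly agree with the ones found above.
    try:
        from sklearn.cluster import KMeans as SklearnKMeans
        reference = SklearnKMeans(n_clusters=3, n_init=10, random_state=0).fit(arr)
        print('scikit-learn centers:')
        print(reference.cluster_centers_)
    except ImportError:
        pass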