# kmeansclustering.py
"""
From scratch implementation of K means clustering which is a unsupervised
clustering method that works by iteratively computing new centroids and
moving centroids to the center of the new formed clusters.
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-05-28 Initial coding
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs


class KMeansClustering:
    def __init__(self, X, num_clusters):
        self.K = num_clusters
        self.max_iterations = 100
        self.plot_figure = True

        self.num_examples = X.shape[0]
        self.num_features = X.shape[1]

    def initialize_random_centroids(self, X):
        # Pick K data points at random (repeats are possible) as initial centroids
        centroids = np.zeros((self.K, self.num_features))
        for k in range(self.K):
            centroid = X[np.random.choice(range(self.num_examples))]
            centroids[k] = centroid
        return centroids

    def create_clusters(self, X, centroids):
        # Will contain a list of the points that are associated with that specific cluster
        clusters = [[] for _ in range(self.K)]

        # Loop through each point and check which is the closest cluster
        for point_idx, point in enumerate(X):
            closest_centroid = np.argmin(
                np.sqrt(np.sum((point - centroids) ** 2, axis=1))
            )
            clusters[closest_centroid].append(point_idx)
        return clusters
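    # Note: the assignment step above could also be vectorized, e.g. (sketch):
    #     dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
    #     labels = np.argmin(dists, axis=1)
    # which gives the same nearest-centroid labels without the Python loop.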

    def calculate_new_centroids(self, clusters, X):
        # Move each centroid to the mean of the points assigned to it.
        # Note: an empty cluster would yield a NaN centroid (mean of an empty slice).
        centroids = np.zeros((self.K, self.num_features))
        for idx, cluster in enumerate(clusters):
            new_centroid = np.mean(X[cluster], axis=0)
            centroids[idx] = new_centroid
        return centroids

    def predict_cluster(self, clusters, X):
        # Flatten the cluster index lists into one label per sample
        y_pred = np.zeros(self.num_examples)
        for cluster_idx, cluster in enumerate(clusters):
            for sample_idx in cluster:
                y_pred[sample_idx] = cluster_idx
        return y_pred

    def plot_fig(self, X, y):
        plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
        plt.show()

    def fit(self, X):
        centroids = self.initialize_random_centroids(X)

        for it in range(self.max_iterations):
            clusters = self.create_clusters(X, centroids)
            previous_centroids = centroids
            centroids = self.calculate_new_centroids(clusters, X)

            # Stop early if the centroids did not move between iterations
            diff = centroids - previous_centroids
            if not diff.any():
                print("Termination criterion satisfied")
                break

        # Get label predictions
        y_pred = self.predict_cluster(clusters, X)

        if self.plot_figure:
            self.plot_fig(X, y_pred)

        return y_pred


if __name__ == "__main__":
    np.random.seed(10)
    num_clusters = 3
    X, _ = make_blobs(n_samples=1000, n_features=2, centers=num_clusters)

    kmeans = KMeansClustering(X, num_clusters)
    y_pred = kmeans.fit(X)
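    # Optional sanity check (illustrative): count how many points landed in
    # each cluster. Labels are stored as floats, so cast to int for bincount.
    counts = np.bincount(y_pred.astype(int), minlength=num_clusters)
    print("Points per cluster:", counts)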