# example_clustering_2.py -- forked from ahmedfgad/GeneticAlgorithmPython
import numpy
import matplotlib.pyplot
import pygad
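
# Optional (not in the original script): a fixed seed, e.g. numpy.random.seed(0),
# could be set here so that the randomly generated clusters below are reproducible.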
# Generate the first synthetic cluster: 10 samples with x1 in [0, 5] and x2 in [2, 6].
cluster1_num_samples = 10
cluster1_x1_start = 0
cluster1_x1_end = 5
cluster1_x2_start = 2
cluster1_x2_end = 6
cluster1_x1 = numpy.random.random(size=(cluster1_num_samples))
cluster1_x1 = cluster1_x1 * (cluster1_x1_end - cluster1_x1_start) + cluster1_x1_start
cluster1_x2 = numpy.random.random(size=(cluster1_num_samples))
cluster1_x2 = cluster1_x2 * (cluster1_x2_end - cluster1_x2_start) + cluster1_x2_start

# Generate the second synthetic cluster: 10 samples with x1 in [10, 15] and x2 in [8, 12].
cluster2_num_samples = 10
cluster2_x1_start = 10
cluster2_x1_end = 15
cluster2_x2_start = 8
cluster2_x2_end = 12
cluster2_x1 = numpy.random.random(size=(cluster2_num_samples))
cluster2_x1 = cluster2_x1 * (cluster2_x1_end - cluster2_x1_start) + cluster2_x1_start
cluster2_x2 = numpy.random.random(size=(cluster2_num_samples))
cluster2_x2 = cluster2_x2 * (cluster2_x2_end - cluster2_x2_start) + cluster2_x2_start

# Stack the two clusters into a single (20, 2) data matrix and plot the ground-truth clustering.
c1 = numpy.array([cluster1_x1, cluster1_x2]).T
c2 = numpy.array([cluster2_x1, cluster2_x2]).T
data = numpy.concatenate((c1, c2), axis=0)

matplotlib.pyplot.scatter(cluster1_x1, cluster1_x2)
matplotlib.pyplot.scatter(cluster2_x1, cluster2_x2)
matplotlib.pyplot.title("Optimal Clustering")
matplotlib.pyplot.show()
def euclidean_distance(X, Y):
    """
    Calculate the Euclidean distance between X and Y.
    :param X: a matrix of size (N, f), where N is the number of samples and f is the number of features per sample.
    :param Y: a single sample of size f.
    Returns a vector of N elements holding the distances between the N samples and Y.
    """
    return numpy.sqrt(numpy.sum(numpy.power(X - Y, 2), axis=1))
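
# Illustrative sanity check (not part of the original example): the point (3, 4) is at
# distance 5 from the origin and the origin is at distance 0 from itself, so
# euclidean_distance(numpy.array([[3.0, 4.0], [0.0, 0.0]]), numpy.array([0.0, 0.0]))
# would return array([5., 0.]).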
def cluster_data(solution, solution_idx):
    """
    Clusters the data based on the current solution.
    """
    global num_clusters, data

    feature_vector_length = data.shape[1]
    cluster_centers = []  # A list of size (C, f) where C is the number of clusters and f is the number of features representing each sample.
    all_clusters_dists = []  # A list of size (C, N) where C is the number of clusters and N is the number of data samples. It holds the distances between each cluster center and all the data samples.
    clusters = []  # A list with C elements where each element holds the indices of the samples within a cluster.
    clusters_sum_dist = []  # A list with C elements where each element represents the sum of distances of the samples within a cluster.

    for clust_idx in range(num_clusters):
        # Extract the current cluster center from the solution vector.
        cluster_centers.append(solution[feature_vector_length*clust_idx:feature_vector_length*(clust_idx+1)])
        # Calculate the distance (e.g. Euclidean) between the current cluster center and all samples.
        cluster_center_dists = euclidean_distance(data, cluster_centers[clust_idx])
        all_clusters_dists.append(numpy.array(cluster_center_dists))

    cluster_centers = numpy.array(cluster_centers)
    all_clusters_dists = numpy.array(all_clusters_dists)

    # A 1D array that, for each sample, holds the index of the cluster with the smallest distance.
    # In other words, the array holds the sample's cluster index.
    cluster_indices = numpy.argmin(all_clusters_dists, axis=0)

    for clust_idx in range(num_clusters):
        clusters.append(numpy.where(cluster_indices == clust_idx)[0])
        # Calculate the sum of distances for the cluster.
        if len(clusters[clust_idx]) == 0:
            # In case the cluster is empty (i.e. has zero samples).
            clusters_sum_dist.append(0)
        else:
            # When the cluster is not empty (i.e. has at least 1 sample).
            clusters_sum_dist.append(numpy.sum(all_clusters_dists[clust_idx, clusters[clust_idx]]))
            # clusters_sum_dist.append(numpy.sum(euclidean_distance(data[clusters[clust_idx], :], cluster_centers[clust_idx])))

    clusters_sum_dist = numpy.array(clusters_sum_dist)

    return cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist
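
# Note on the chromosome layout (this follows from the slicing above): with num_clusters = 2
# clusters of 2-D points, a solution is a flat vector of 4 genes
# [c0_x1, c0_x2, c1_x1, c1_x2], i.e. the cluster centers concatenated feature by feature.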
def fitness_func(solution, solution_idx):
    _, _, _, _, clusters_sum_dist = cluster_data(solution, solution_idx)

    # The tiny value 0.00000001 is added to the denominator in case the total distance is 0.
    fitness = 1.0 / (numpy.sum(clusters_sum_dist) + 0.00000001)

    return fitness
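
# For intuition: a solution whose clusters have a total within-cluster distance of 20 gets a
# fitness of roughly 1/20 = 0.05, so tighter clusters yield higher fitness. Note that this
# 2-argument fitness signature matches PyGAD 2.x; PyGAD 3.x passes the GA instance as the
# first argument, i.e. fitness_func(ga_instance, solution, solution_idx).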
num_clusters = 2
num_genes = num_clusters * data.shape[1]
ga_instance = pygad.GA(num_generations=100,
                       sol_per_pop=10,
                       num_parents_mating=5,
                       init_range_low=-6,
                       init_range_high=20,
                       keep_parents=2,
                       num_genes=num_genes,
                       fitness_func=fitness_func,
                       suppress_warnings=True)
ga_instance.run()
best_solution, best_solution_fitness, best_solution_idx = ga_instance.best_solution()
print("Best solution is {bs}".format(bs=best_solution))
print("Fitness of the best solution is {bsf}".format(bsf=best_solution_fitness))
print("Best solution found after {gen} generations".format(gen=ga_instance.best_solution_generation))
cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist = cluster_data(best_solution, best_solution_idx)
for cluster_idx in range(num_clusters):
    cluster_x = data[clusters[cluster_idx], 0]
    cluster_y = data[clusters[cluster_idx], 1]
    matplotlib.pyplot.scatter(cluster_x, cluster_y)
    matplotlib.pyplot.scatter(cluster_centers[cluster_idx, 0], cluster_centers[cluster_idx, 1], linewidths=5)
matplotlib.pyplot.title("Clustering using PyGAD")
matplotlib.pyplot.show()