Including spectral clustering
omardrwch committed Jan 20, 2021
1 parent 1eac308 commit 144f7ae
Showing 9 changed files with 1,350 additions and 0 deletions.
130 changes: 130 additions & 0 deletions spectral_clustering/build_similarity_graph.py
@@ -0,0 +1,130 @@
"""
Functions to build and visualize similarity graphs, and to choose epsilon in epsilon-graphs.
"""

import numpy as np
import matplotlib.pyplot as plt
import scipy.spatial.distance as sd
import sys
import os

from utils import plot_graph_matrix, min_span_tree
from generate_data import worst_case_blob, blobs, two_moons, point_and_circle


def build_similarity_graph(X, var=1.0, eps=0.0, k=0):
"""
TO BE COMPLETED.
    Computes the similarity matrix for a given dataset of samples. If k=0, builds an epsilon graph;
    otherwise, builds a kNN graph.
:param X: (n x m) matrix of m-dimensional samples
:param var: the sigma value for the exponential function, already squared
:param eps: threshold eps for epsilon graphs
:param k: the number of neighbours k for k-nn. If zero, use epsilon-graph
:return:
W: (n x n) dimensional matrix representing the adjacency matrix of the graph
"""
n = X.shape[0]
W = np.zeros((n, n))

"""
    Build the full similarity matrix, before applying the epsilon threshold or the kNN selection
similarities: (n x n) matrix with similarities between all possible couples of points.
The similarity function is d(x,y)=exp(-||x-y||^2/(2*var))
"""

similarities = np.zeros((n, n))
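    # One possible way to fill `similarities` (a sketch, not necessarily the intended
    # solution): squared Euclidean distances via scipy, then the Gaussian kernel above.
    pairwise_sq_dists = sd.cdist(X, X, metric='sqeuclidean')
    similarities = np.exp(-pairwise_sq_dists / (2 * var))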

# If epsilon graph
if k == 0:
"""
compute an epsilon graph from the similarities
        for each pair of nodes (x_i, x_j), an epsilon graph has weights
        w_ij = d(x_i,x_j) when d(x_i,x_j) >= eps, and 0 otherwise
"""
pass
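        # A possible completion (sketch): keep similarities above the threshold and
        # zero out self-loops (exp(0) = 1 would otherwise always pass the threshold).
        W = np.where(similarities >= eps, similarities, 0.0)
        np.fill_diagonal(W, 0.0)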

# If kNN graph
if k != 0:
"""
compute a k-nn graph from the similarities
for each node x_i, a k-nn graph has weights
        w_ij = d(x_i,x_j) for the k closest nodes to x_i, and 0
        for all the remaining nodes
Remember to remove self similarity and
make the graph undirected
"""
pass
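        # A possible completion (sketch): for every node keep the k most similar
        # neighbours (ignoring self-similarity), then symmetrize so the graph is
        # undirected.
        sim = similarities.copy()
        np.fill_diagonal(sim, 0.0)
        for i in range(n):
            neighbours = np.argsort(sim[i])[::-1][:k]  # indices of the k largest similarities
            W[i, neighbours] = sim[i, neighbours]
        W = np.maximum(W, W.T)  # keep an edge if either endpoint selects it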

return W


def plot_similarity_graph(X, Y, var=1.0, eps=0.0, k=5):
"""
Function to plot the similarity graph, given data and parameters.
:param X: (n x m) matrix of m-dimensional samples
:param Y: (n, ) vector with cluster assignments
:param var: the sigma value for the exponential function, already squared
:param eps: threshold eps for epsilon graphs
:param k: the number of neighbours k for k-nn
:return:
"""
# use the build_similarity_graph function to build the graph W
# W: (n x n) dimensional matrix representing the adjacency matrix of the graph
W = build_similarity_graph(X, var, eps, k)

# Use auxiliary function to plot
plot_graph_matrix(X, Y, W)


def how_to_choose_epsilon():
"""
TO BE COMPLETED.
Consider the distance matrix with entries dist(x_i, x_j) (the euclidean distance between x_i and x_j)
representing a fully connected graph.
One way to choose the parameter epsilon to build a graph is to choose the maximum value of dist(x_i, x_j) where
(i,j) is an edge that is present in the minimal spanning tree of the fully connected graph. Then, the threshold
epsilon can be chosen as exp(-dist(x_i, x_j)**2.0/(2*sigma^2)).
"""
# the number of samples to generate
num_samples = 100

    # parameter passed to worst_case_blob; try different values
gen_pam = 0.0 # to understand the meaning of the parameter, read worst_case_blob in generate_data.py

# get blob data
X, Y = worst_case_blob(num_samples, gen_pam)
# X, Y = two_moons(num_samples)

"""
use the distance function and the min_span_tree function to build the minimal spanning tree min_tree
- var: the exponential_euclidean's sigma2 parameter
- dists: (n x n) matrix with euclidean distance between all possible couples of points
- min_tree: (n x n) indicator matrix for the edges in the minimal spanning tree
"""
var = 1.0
    dists = sd.cdist(X, X, metric='euclidean')  # dists[i, j] = euclidean distance between x_i and x_j
min_tree = min_span_tree(dists)

"""
set threshold epsilon to the max weight in min_tree
"""
    distance_threshold = np.max(dists[min_tree > 0])  # max edge weight in the minimal spanning tree
    eps = np.exp(-distance_threshold ** 2.0 / (2 * var))

"""
use the build_similarity_graph function to build the graph W
W: (n x n) dimensional matrix representing
the adjacency matrix of the graph
use plot_graph_matrix to plot the graph
"""
W = build_similarity_graph(X, var=var, eps=eps, k=0)
plot_graph_matrix(X, Y, W)


if __name__ == '__main__':
pass
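    # A possible demo (a sketch): plot a k-NN similarity graph on blobs data and
    # run the epsilon heuristic above.
    X_demo, Y_demo = blobs(200, n_blobs=2)
    plot_similarity_graph(X_demo, Y_demo, var=1.0, k=5)
    how_to_choose_epsilon()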
Binary file added spectral_clustering/data/four_elements.bmp
Binary file not shown.
Binary file added spectral_clustering/data/fruit_salad.bmp
Binary file not shown.
141 changes: 141 additions & 0 deletions spectral_clustering/generate_data.py
@@ -0,0 +1,141 @@
"""
Code for generating data in R^2 for clustering algorithms.
"""

import numpy as np
import sklearn.datasets as skd


def worst_case_blob(num_samples, delta=5):
"""
    Generates a single Gaussian blob plus one outlier.
    :param num_samples: number of samples to create in the blob
    :param delta: offset added to the last sample, which is moved to (max(X) + delta, 0) to create an outlier
:return: X, (num_samples, 2) matrix of 2-dimensional samples
Y, (num_samples, ) vector of "true" cluster assignment
"""
blob_var = 0.3
# X: matrix of shape (num_samples, 2)
# Y: vector of shape (num_samples, )
X, Y = skd.make_blobs(n_samples=num_samples, n_features=2, centers=np.column_stack((0, 0)), cluster_std=blob_var)
X[-1] = [np.max(X) + delta, 0]
return [X, Y]


def blobs(num_samples, n_blobs=2, blob_var=0.15, surplus=0):
"""
    Creates n_blobs Gaussian blobs evenly spaced on a circle.
:param num_samples: number of samples to create in the dataset
:param n_blobs: how many separate blobs to create
:param blob_var: gaussian variance of each blob
    :param surplus: number of extra samples added to the last blob to create unbalanced classes
:return: X, (num_samples, 2) matrix of 2-dimensional samples
Y, (num_samples, ) vector of "true" cluster assignment
"""
# data array
X = np.zeros((num_samples, 2))
# array containing the indices of the true clusters
Y = np.zeros(num_samples, dtype=np.int32)

# generate data
block_size = (num_samples-surplus)//n_blobs

for ii in range(1, n_blobs+1):
start_index = (ii - 1) * block_size
end_index = ii * block_size
if ii == n_blobs:
end_index = num_samples
Y[start_index:end_index] = ii - 1
nn = end_index - start_index

X[start_index:end_index, 0] = np.cos(2*np.pi*ii/n_blobs) + blob_var*np.random.randn(nn)
X[start_index:end_index, 1] = np.sin(2*np.pi*ii/n_blobs) + blob_var*np.random.randn(nn)
return X, Y


def two_moons(num_samples, moon_radius=2.0, moon_var=0.02):
"""
Creates two intertwined moons
:param num_samples: number of samples to create in the dataset
:param moon_radius: radius of the moons
:param moon_var: variance of the moons
:return: X, (num_samples, 2) matrix of 2-dimensional samples
Y, (num_samples, ) vector of "true" cluster assignment
"""
X = np.zeros((num_samples, 2))

for i in range(int(num_samples / 2)):
r = moon_radius + 4 * i / num_samples
t = i * 3 / num_samples * np.pi
X[i, 0] = r * np.cos(t)
X[i, 1] = r * np.sin(t)
X[i + int(num_samples / 2), 0] = r * np.cos(t + np.pi)
X[i + int(num_samples / 2), 1] = r * np.sin(t + np.pi)

X = X + np.sqrt(moon_var) * np.random.normal(size=(num_samples, 2))
Y = np.ones(num_samples)
    Y[:int(num_samples / 2)] = 0  # first half: moon 0, second half: moon 1
return [X, Y.astype(int)]


def point_and_circle(num_samples, radius=2.0, sigma=0.15):
"""
Creates point and circle
:param num_samples: number of samples to create in the dataset
:param sigma: variance
:param radius: radius of the circle
:return: X, (num_samples, 2) matrix of 2-dimensional samples
Y, (num_samples, ) vector of "true" cluster assignment in {0, ..., c-1}
"""
# data array
X = np.zeros((num_samples, 2))
# array containing the indices of the true clusters
Y = np.zeros(num_samples, dtype=np.int32)

# generate data
block_size = num_samples // 2
for ii in range(1, 3):
start_index = (ii - 1) * block_size
end_index = ii * block_size
        if ii == 2:
end_index = num_samples
Y[start_index:end_index] = ii - 1
nn = end_index - start_index
if ii == 1:
X[start_index:end_index, 0] = sigma*np.random.randn(nn)
X[start_index:end_index, 1] = sigma*np.random.randn(nn)
else:
angle = 2*np.pi * np.random.uniform(size=nn) - np.pi
X[start_index:end_index, 0] = radius*np.cos(angle) + sigma * np.random.randn(nn)
X[start_index:end_index, 1] = radius*np.sin(angle) + sigma * np.random.randn(nn)
return X, Y


# --------------------------------
# Visualizing the data
# --------------------------------

if __name__ == '__main__':
from utils import plot_clusters

blobs_data, blobs_clusters = blobs(600, n_blobs=4, surplus=500)
moons_data, moons_clusters = two_moons(600)
point_circle_data, point_circle_clusters = point_and_circle(600)
worst_blobs_data, worst_blobs_clusters = worst_case_blob(600, 5.0)

print(blobs_data.shape)
# print((blobs_clusters == 0).sum())
# print((blobs_clusters == 1).sum())
# print((blobs_clusters == 2).sum())
# print((blobs_clusters == 3).sum())

plot_clusters(blobs_data, blobs_clusters, 'blobs', show=True)
plot_clusters(moons_data, moons_clusters, 'moons', show=False)
plot_clusters(point_circle_data, point_circle_clusters, 'point and circle', show=False)
plot_clusters(worst_blobs_data, worst_blobs_clusters, 'worst case blob', show=True)


56 changes: 56 additions & 0 deletions spectral_clustering/image_segmentation.py
@@ -0,0 +1,56 @@
from skimage import io
import numpy as np
import matplotlib.pyplot as plt
import os
from utils import *
from build_similarity_graph import build_similarity_graph
from spectral_clustering import build_laplacian, spectral_clustering


def image_segmentation(input_img='four_elements.bmp'):
"""
TO BE COMPLETED
Function to perform image segmentation.
:param input_img: name of the image file in /data (e.g. 'four_elements.bmp')
"""
filename = os.path.join('data', input_img)

X = io.imread(filename)
X = (X - np.min(X)) / (np.max(X) - np.min(X))

im_side = np.size(X, 1)
Xr = X.reshape(im_side ** 2, 3)
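    # Note: this reshape assumes a square image (height == width); the .bmp files in
    # /data are expected to satisfy this.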
"""
Y_rec should contain an index from 0 to c-1 where c is the
number of segments you want to split the image into
"""

"""
Choose parameters
"""
var = 1.0
k = 2
laplacian_normalization = 'unn'
chosen_eig_indices = [1]
num_classes = 1
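    # The values above are placeholders to be tuned; for c segments one would typically
    # use num_classes = c and c eigenvector indices, e.g. (a sketch, convention-dependent):
    # chosen_eig_indices = [0, 1, 2, 3]
    # num_classes = 4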

    W = build_similarity_graph(Xr, var=var, k=k)  # use the flattened pixels, not the raw image array
L = build_laplacian(W, laplacian_normalization)
Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)

plt.figure()

plt.subplot(1, 2, 1)
plt.imshow(X)

plt.subplot(1, 2, 2)
Y_rec = Y_rec.reshape(im_side, im_side)
plt.imshow(Y_rec)

plt.show()


if __name__ == '__main__':
image_segmentation()
6 changes: 6 additions & 0 deletions spectral_clustering/requirements.txt
@@ -0,0 +1,6 @@
numpy
matplotlib
networkx
scipy
scikit-learn
scikit-image