Including spectral clustering

thizirinait · Jan 20, 2021 · 144f7ae · 144f7ae
1 parent 1eac308
commit 144f7ae
Show file tree

Hide file tree

Showing 9 changed files with 1,350 additions and 0 deletions.
diff --git a/spectral_clustering/build_similarity_graph.py b/spectral_clustering/build_similarity_graph.py
@@ -0,0 +1,130 @@
+"""
+Functions to build and visualize similarity graphs, and to choose epsilon in epsilon-graphs.
+"""
+
+import numpy as np
+import matplotlib.pyplot as pyplot
+import scipy.spatial.distance as sd
+import sys
+import os
+
+from utils import plot_graph_matrix, min_span_tree
+from generate_data import worst_case_blob, blobs, two_moons, point_and_circle
+
+
+def build_similarity_graph(X, var=1.0, eps=0.0, k=0):
+    """
+    TO BE COMPLETED.
+
+    Computes the similarity matrix for a given dataset of samples. If k=0, builds epsilon graph. Otherwise, builds
+    kNN graph.
+
+    :param X:    (n x m) matrix of m-dimensional samples
+    :param var:  the sigma value for the exponential function, already squared
+    :param eps:  threshold eps for epsilon graphs
+    :param k:    the number of neighbours k for k-nn. If zero, use epsilon-graph
+    :return:
+        W: (n x n) dimensional matrix representing the adjacency matrix of the graph
+    """
+    n = X.shape[0]
+    W = np.zeros((n, n))
+
+    """
+    Build similarity graph, before threshold or kNN
+    similarities: (n x n) matrix with similarities between all possible couples of points.
+    The similarity function is d(x,y)=exp(-||x-y||^2/(2*var))
+    """
+
+    similarities = np.zeros((n, n))
+
+    # If epsilon graph
+    if k == 0:
+        """
+        compute an epsilon graph from the similarities             
+        for each node x_i, an epsilon graph has weights             
+        w_ij = d(x_i,x_j) when w_ij >= eps, and 0 otherwise          
+        """
+        pass
+
+    # If kNN graph
+    if k != 0:
+        """
+        compute a k-nn graph from the similarities                   
+        for each node x_i, a k-nn graph has weights                  
+        w_ij = d(x_i,x_j) for the k closest nodes to x_i, and 0     
+        for all the k-n remaining nodes                              
+        Remember to remove self similarity and                       
+        make the graph undirected                                    
+        """
+        pass
+
+    return W
+
+
+def plot_similarity_graph(X, Y, var=1.0, eps=0.0, k=5):
+    """
+    Function to plot the similarity graph, given data and parameters.
+
+    :param X: (n x m) matrix of m-dimensional samples
+    :param Y: (n, ) vector with cluster assignments
+    :param var:  the sigma value for the exponential function, already squared
+    :param eps:  threshold eps for epsilon graphs
+    :param k:    the number of neighbours k for k-nn
+    :return:
+    """
+    # use the build_similarity_graph function to build the graph W
+    # W: (n x n) dimensional matrix representing the adjacency matrix of the graph
+    W = build_similarity_graph(X, var, eps, k)
+
+    # Use auxiliary function to plot
+    plot_graph_matrix(X, Y, W)
+
+
+def how_to_choose_epsilon():
+    """
+    TO BE COMPLETED.
+
+    Consider the distance matrix with entries dist(x_i, x_j) (the euclidean distance between x_i and x_j)
+    representing a fully connected graph.
+    One way to choose the parameter epsilon to build a graph is to choose the maximum value of dist(x_i, x_j) where
+    (i,j) is an edge that is present in the minimal spanning tree of the fully connected graph. Then, the threshold
+    epsilon can be chosen as exp(-dist(x_i, x_j)**2.0/(2*sigma^2)).
+    """
+    # the number of samples to generate
+    num_samples = 100
+
+    # the option necessary for worst_case_blob, try different values
+    gen_pam = 0.0  # to understand the meaning of the parameter, read worst_case_blob in generate_data.py
+
+    # get blob data
+    X, Y = worst_case_blob(num_samples, gen_pam)
+    # X, Y = two_moons(num_samples)
+
+    """
+     use the distance function and the min_span_tree function to build the minimal spanning tree min_tree                   
+     - var: the exponential_euclidean's sigma2 parameter          
+     - dists: (n x n) matrix with euclidean distance between all possible couples of points                   
+     - min_tree: (n x n) indicator matrix for the edges in the minimal spanning tree                           
+    """
+    var = 1.0
+    dists = None  # dists[i, j] = euclidean distance between x_i and x_j
+    min_tree = min_span_tree(dists)
+
+    """
+    set threshold epsilon to the max weight in min_tree 
+    """
+    distance_threshold = None
+    # eps = np.exp(-distance_threshold**2.0/(2*var))
+
+    """
+    use the build_similarity_graph function to build the graph W  
+     W: (n x n) dimensional matrix representing                    
+        the adjacency matrix of the graph
+       use plot_graph_matrix to plot the graph                    
+    """
+    W = build_similarity_graph(X, var=var, eps=eps, k=0)
+    plot_graph_matrix(X, Y, W)
+
+
+if __name__ == '__main__':
+    pass
diff --git a/spectral_clustering/data/four_elements.bmp b/spectral_clustering/data/four_elements.bmp
diff --git a/spectral_clustering/data/fruit_salad.bmp b/spectral_clustering/data/fruit_salad.bmp
diff --git a/spectral_clustering/generate_data.py b/spectral_clustering/generate_data.py
@@ -0,0 +1,141 @@
+"""
+Code for generating data in R^2 for clustering algorithms.
+"""
+
+import numpy as np
+import sklearn.datasets as skd
+
+
+def worst_case_blob(num_samples, delta=5):
+    """
+    Generates a single blob.
+
+    :param num_samples: number of samples to create in the blob
+    :param delta:
+    :return: X,  (num_samples, 2) matrix of 2-dimensional samples
+             Y,  (num_samples, ) vector of "true" cluster assignment
+    """
+    blob_var = 0.3
+    # X: matrix of shape (num_samples, 2)
+    # Y: vector of shape (num_samples, )
+    X, Y = skd.make_blobs(n_samples=num_samples, n_features=2, centers=np.column_stack((0, 0)), cluster_std=blob_var)
+    X[-1] = [np.max(X) + delta, 0]
+    return [X, Y]
+
+
+def blobs(num_samples, n_blobs=2, blob_var=0.15, surplus=0):
+    """
+    Creates N gaussian blobs evenly spaced across a circle.
+
+    :param num_samples: number of samples to create in the dataset
+    :param n_blobs:      how many separate blobs to create
+    :param blob_var:    gaussian variance of each blob
+    :param surplus:     number of extra samples added to first blob to create unbalanced classes
+    :return: X,  (num_samples, 2) matrix of 2-dimensional samples
+             Y,  (num_samples, ) vector of "true" cluster assignment
+    """
+    # data array
+    X = np.zeros((num_samples, 2))
+    # array containing the indices of the true clusters
+    Y = np.zeros(num_samples, dtype=np.int32)
+
+    # generate data
+    block_size = (num_samples-surplus)//n_blobs
+
+    for ii in range(1, n_blobs+1):
+        start_index = (ii - 1) * block_size
+        end_index = ii * block_size
+        if ii == n_blobs:
+            end_index = num_samples
+        Y[start_index:end_index] = ii - 1
+        nn = end_index - start_index
+
+        X[start_index:end_index, 0] = np.cos(2*np.pi*ii/n_blobs) + blob_var*np.random.randn(nn)
+        X[start_index:end_index, 1] = np.sin(2*np.pi*ii/n_blobs) + blob_var*np.random.randn(nn)
+    return X, Y
+
+
+def two_moons(num_samples, moon_radius=2.0, moon_var=0.02):
+    """
+    Creates two intertwined moons
+
+    :param num_samples: number of samples to create in the dataset
+    :param moon_radius: radius of the moons
+    :param moon_var:    variance of the moons
+    :return: X,  (num_samples, 2) matrix of 2-dimensional samples
+             Y,  (num_samples, ) vector of "true" cluster assignment
+    """
+    X = np.zeros((num_samples, 2))
+
+    for i in range(int(num_samples / 2)):
+        r = moon_radius + 4 * i / num_samples
+        t = i * 3 / num_samples * np.pi
+        X[i, 0] = r * np.cos(t)
+        X[i, 1] = r * np.sin(t)
+        X[i + int(num_samples / 2), 0] = r * np.cos(t + np.pi)
+        X[i + int(num_samples / 2), 1] = r * np.sin(t + np.pi)
+
+    X = X + np.sqrt(moon_var) * np.random.normal(size=(num_samples, 2))
+    Y = np.ones(num_samples)
+    Y[:int(num_samples / 2) + 1] = 0
+    return [X, Y.astype(int)]
+
+
+def point_and_circle(num_samples, radius=2.0, sigma=0.15):
+    """
+    Creates point and circle
+
+    :param num_samples: number of samples to create in the dataset
+    :param sigma:       variance
+    :param radius:      radius of the circle
+    :return: X,  (num_samples, 2) matrix of 2-dimensional samples
+             Y,  (num_samples, ) vector of "true" cluster assignment in {0, ..., c-1}
+    """
+    # data array
+    X = np.zeros((num_samples, 2))
+    # array containing the indices of the true clusters
+    Y = np.zeros(num_samples, dtype=np.int32)
+
+    # generate data
+    block_size = num_samples // 2
+    for ii in range(1, 3):
+        start_index = (ii - 1) * block_size
+        end_index = ii * block_size
+        if ii == 3:
+            end_index = num_samples
+        Y[start_index:end_index] = ii - 1
+        nn = end_index - start_index
+        if ii == 1:
+            X[start_index:end_index, 0] = sigma*np.random.randn(nn)
+            X[start_index:end_index, 1] = sigma*np.random.randn(nn)
+        else:
+            angle = 2*np.pi * np.random.uniform(size=nn) - np.pi
+            X[start_index:end_index, 0] = radius*np.cos(angle) + sigma * np.random.randn(nn)
+            X[start_index:end_index, 1] = radius*np.sin(angle) + sigma * np.random.randn(nn)
+    return X, Y
+
+
+# --------------------------------
+# Visualizing the data
+# --------------------------------
+
+if __name__ == '__main__':
+    from utils import plot_clusters
+
+    blobs_data, blobs_clusters = blobs(600, n_blobs=4, surplus=500)
+    moons_data, moons_clusters = two_moons(600)
+    point_circle_data, point_circle_clusters = point_and_circle(600)
+    worst_blobs_data, worst_blobs_clusters = worst_case_blob(600, 5.0)
+
+    print(blobs_data.shape)
+    # print((blobs_clusters == 0).sum())
+    # print((blobs_clusters == 1).sum())
+    # print((blobs_clusters == 2).sum())
+    # print((blobs_clusters == 3).sum())
+
+    plot_clusters(blobs_data, blobs_clusters, 'blobs', show=True)
+    plot_clusters(moons_data, moons_clusters, 'moons', show=False)
+    plot_clusters(point_circle_data, point_circle_clusters, 'point and circle', show=False)
+    plot_clusters(worst_blobs_data, worst_blobs_clusters, 'worst case blob', show=True)
+
+
diff --git a/spectral_clustering/image_segmentation.py b/spectral_clustering/image_segmentation.py
@@ -0,0 +1,56 @@
+from skimage import io
+import numpy as np
+import matplotlib.pyplot as plt
+import os
+from utils import *
+from build_similarity_graph import build_similarity_graph
+from spectral_clustering import build_laplacian, spectral_clustering
+
+
+def image_segmentation(input_img='four_elements.bmp'):
+    """
+    TO BE COMPLETED
+
+    Function to perform image segmentation.
+
+    :param input_img: name of the image file in /data (e.g. 'four_elements.bmp')
+    """
+    filename = os.path.join('data', input_img)
+
+    X = io.imread(filename)
+    X = (X - np.min(X)) / (np.max(X) - np.min(X))
+
+    im_side = np.size(X, 1)
+    Xr = X.reshape(im_side ** 2, 3)
+    """
+    Y_rec should contain an index from 0 to c-1 where c is the     
+     number of segments you want to split the image into          
+    """
+
+    """
+    Choose parameters
+    """
+    var = 1.0
+    k = 2
+    laplacian_normalization = 'unn'
+    chosen_eig_indices = [1]
+    num_classes = 1
+
+    W = build_similarity_graph(X, var=var, k=k)
+    L = build_laplacian(W, laplacian_normalization)
+    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)
+
+    plt.figure()
+
+    plt.subplot(1, 2, 1)
+    plt.imshow(X)
+
+    plt.subplot(1, 2, 2)
+    Y_rec = Y_rec.reshape(im_side, im_side)
+    plt.imshow(Y_rec)
+
+    plt.show()
+
+
+if __name__ == '__main__':
+    image_segmentation()
diff --git a/spectral_clustering/requirements.txt b/spectral_clustering/requirements.txt
@@ -0,0 +1,6 @@
+numpy
+matplotlib
+networkx
+scipy
+scikit-learn
+scikit-image