-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
1,350 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
""" | ||
Functions to build and visualize similarity graphs, and to choose epsilon in epsilon-graphs. | ||
""" | ||
|
||
import numpy as np | ||
import matplotlib.pyplot as pyplot | ||
import scipy.spatial.distance as sd | ||
import sys | ||
import os | ||
|
||
from utils import plot_graph_matrix, min_span_tree | ||
from generate_data import worst_case_blob, blobs, two_moons, point_and_circle | ||
|
||
|
||
def build_similarity_graph(X, var=1.0, eps=0.0, k=0): | ||
""" | ||
TO BE COMPLETED. | ||
Computes the similarity matrix for a given dataset of samples. If k=0, builds epsilon graph. Otherwise, builds | ||
kNN graph. | ||
:param X: (n x m) matrix of m-dimensional samples | ||
:param var: the sigma value for the exponential function, already squared | ||
:param eps: threshold eps for epsilon graphs | ||
:param k: the number of neighbours k for k-nn. If zero, use epsilon-graph | ||
:return: | ||
W: (n x n) dimensional matrix representing the adjacency matrix of the graph | ||
""" | ||
n = X.shape[0] | ||
W = np.zeros((n, n)) | ||
|
||
""" | ||
Build similarity graph, before threshold or kNN | ||
similarities: (n x n) matrix with similarities between all possible couples of points. | ||
The similarity function is d(x,y)=exp(-||x-y||^2/(2*var)) | ||
""" | ||
|
||
similarities = np.zeros((n, n)) | ||
|
||
# If epsilon graph | ||
if k == 0: | ||
""" | ||
compute an epsilon graph from the similarities | ||
for each node x_i, an epsilon graph has weights | ||
w_ij = d(x_i,x_j) when w_ij >= eps, and 0 otherwise | ||
""" | ||
pass | ||
|
||
# If kNN graph | ||
if k != 0: | ||
""" | ||
compute a k-nn graph from the similarities | ||
for each node x_i, a k-nn graph has weights | ||
w_ij = d(x_i,x_j) for the k closest nodes to x_i, and 0 | ||
for all the k-n remaining nodes | ||
Remember to remove self similarity and | ||
make the graph undirected | ||
""" | ||
pass | ||
|
||
return W | ||
|
||
|
||
def plot_similarity_graph(X, Y, var=1.0, eps=0.0, k=5): | ||
""" | ||
Function to plot the similarity graph, given data and parameters. | ||
:param X: (n x m) matrix of m-dimensional samples | ||
:param Y: (n, ) vector with cluster assignments | ||
:param var: the sigma value for the exponential function, already squared | ||
:param eps: threshold eps for epsilon graphs | ||
:param k: the number of neighbours k for k-nn | ||
:return: | ||
""" | ||
# use the build_similarity_graph function to build the graph W | ||
# W: (n x n) dimensional matrix representing the adjacency matrix of the graph | ||
W = build_similarity_graph(X, var, eps, k) | ||
|
||
# Use auxiliary function to plot | ||
plot_graph_matrix(X, Y, W) | ||
|
||
|
||
def how_to_choose_epsilon(): | ||
""" | ||
TO BE COMPLETED. | ||
Consider the distance matrix with entries dist(x_i, x_j) (the euclidean distance between x_i and x_j) | ||
representing a fully connected graph. | ||
One way to choose the parameter epsilon to build a graph is to choose the maximum value of dist(x_i, x_j) where | ||
(i,j) is an edge that is present in the minimal spanning tree of the fully connected graph. Then, the threshold | ||
epsilon can be chosen as exp(-dist(x_i, x_j)**2.0/(2*sigma^2)). | ||
""" | ||
# the number of samples to generate | ||
num_samples = 100 | ||
|
||
# the option necessary for worst_case_blob, try different values | ||
gen_pam = 0.0 # to understand the meaning of the parameter, read worst_case_blob in generate_data.py | ||
|
||
# get blob data | ||
X, Y = worst_case_blob(num_samples, gen_pam) | ||
# X, Y = two_moons(num_samples) | ||
|
||
""" | ||
use the distance function and the min_span_tree function to build the minimal spanning tree min_tree | ||
- var: the exponential_euclidean's sigma2 parameter | ||
- dists: (n x n) matrix with euclidean distance between all possible couples of points | ||
- min_tree: (n x n) indicator matrix for the edges in the minimal spanning tree | ||
""" | ||
var = 1.0 | ||
dists = None # dists[i, j] = euclidean distance between x_i and x_j | ||
min_tree = min_span_tree(dists) | ||
|
||
""" | ||
set threshold epsilon to the max weight in min_tree | ||
""" | ||
distance_threshold = None | ||
# eps = np.exp(-distance_threshold**2.0/(2*var)) | ||
|
||
""" | ||
use the build_similarity_graph function to build the graph W | ||
W: (n x n) dimensional matrix representing | ||
the adjacency matrix of the graph | ||
use plot_graph_matrix to plot the graph | ||
""" | ||
W = build_similarity_graph(X, var=var, eps=eps, k=0) | ||
plot_graph_matrix(X, Y, W) | ||
|
||
|
||
if __name__ == '__main__': | ||
pass |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
""" | ||
Code for generating data in R^2 for clustering algorithms. | ||
""" | ||
|
||
import numpy as np | ||
import sklearn.datasets as skd | ||
|
||
|
||
def worst_case_blob(num_samples, delta=5): | ||
""" | ||
Generates a single blob. | ||
:param num_samples: number of samples to create in the blob | ||
:param delta: | ||
:return: X, (num_samples, 2) matrix of 2-dimensional samples | ||
Y, (num_samples, ) vector of "true" cluster assignment | ||
""" | ||
blob_var = 0.3 | ||
# X: matrix of shape (num_samples, 2) | ||
# Y: vector of shape (num_samples, ) | ||
X, Y = skd.make_blobs(n_samples=num_samples, n_features=2, centers=np.column_stack((0, 0)), cluster_std=blob_var) | ||
X[-1] = [np.max(X) + delta, 0] | ||
return [X, Y] | ||
|
||
|
||
def blobs(num_samples, n_blobs=2, blob_var=0.15, surplus=0): | ||
""" | ||
Creates N gaussian blobs evenly spaced across a circle. | ||
:param num_samples: number of samples to create in the dataset | ||
:param n_blobs: how many separate blobs to create | ||
:param blob_var: gaussian variance of each blob | ||
:param surplus: number of extra samples added to first blob to create unbalanced classes | ||
:return: X, (num_samples, 2) matrix of 2-dimensional samples | ||
Y, (num_samples, ) vector of "true" cluster assignment | ||
""" | ||
# data array | ||
X = np.zeros((num_samples, 2)) | ||
# array containing the indices of the true clusters | ||
Y = np.zeros(num_samples, dtype=np.int32) | ||
|
||
# generate data | ||
block_size = (num_samples-surplus)//n_blobs | ||
|
||
for ii in range(1, n_blobs+1): | ||
start_index = (ii - 1) * block_size | ||
end_index = ii * block_size | ||
if ii == n_blobs: | ||
end_index = num_samples | ||
Y[start_index:end_index] = ii - 1 | ||
nn = end_index - start_index | ||
|
||
X[start_index:end_index, 0] = np.cos(2*np.pi*ii/n_blobs) + blob_var*np.random.randn(nn) | ||
X[start_index:end_index, 1] = np.sin(2*np.pi*ii/n_blobs) + blob_var*np.random.randn(nn) | ||
return X, Y | ||
|
||
|
||
def two_moons(num_samples, moon_radius=2.0, moon_var=0.02): | ||
""" | ||
Creates two intertwined moons | ||
:param num_samples: number of samples to create in the dataset | ||
:param moon_radius: radius of the moons | ||
:param moon_var: variance of the moons | ||
:return: X, (num_samples, 2) matrix of 2-dimensional samples | ||
Y, (num_samples, ) vector of "true" cluster assignment | ||
""" | ||
X = np.zeros((num_samples, 2)) | ||
|
||
for i in range(int(num_samples / 2)): | ||
r = moon_radius + 4 * i / num_samples | ||
t = i * 3 / num_samples * np.pi | ||
X[i, 0] = r * np.cos(t) | ||
X[i, 1] = r * np.sin(t) | ||
X[i + int(num_samples / 2), 0] = r * np.cos(t + np.pi) | ||
X[i + int(num_samples / 2), 1] = r * np.sin(t + np.pi) | ||
|
||
X = X + np.sqrt(moon_var) * np.random.normal(size=(num_samples, 2)) | ||
Y = np.ones(num_samples) | ||
Y[:int(num_samples / 2) + 1] = 0 | ||
return [X, Y.astype(int)] | ||
|
||
|
||
def point_and_circle(num_samples, radius=2.0, sigma=0.15): | ||
""" | ||
Creates point and circle | ||
:param num_samples: number of samples to create in the dataset | ||
:param sigma: variance | ||
:param radius: radius of the circle | ||
:return: X, (num_samples, 2) matrix of 2-dimensional samples | ||
Y, (num_samples, ) vector of "true" cluster assignment in {0, ..., c-1} | ||
""" | ||
# data array | ||
X = np.zeros((num_samples, 2)) | ||
# array containing the indices of the true clusters | ||
Y = np.zeros(num_samples, dtype=np.int32) | ||
|
||
# generate data | ||
block_size = num_samples // 2 | ||
for ii in range(1, 3): | ||
start_index = (ii - 1) * block_size | ||
end_index = ii * block_size | ||
if ii == 3: | ||
end_index = num_samples | ||
Y[start_index:end_index] = ii - 1 | ||
nn = end_index - start_index | ||
if ii == 1: | ||
X[start_index:end_index, 0] = sigma*np.random.randn(nn) | ||
X[start_index:end_index, 1] = sigma*np.random.randn(nn) | ||
else: | ||
angle = 2*np.pi * np.random.uniform(size=nn) - np.pi | ||
X[start_index:end_index, 0] = radius*np.cos(angle) + sigma * np.random.randn(nn) | ||
X[start_index:end_index, 1] = radius*np.sin(angle) + sigma * np.random.randn(nn) | ||
return X, Y | ||
|
||
|
||
# -------------------------------- | ||
# Visualizing the data | ||
# -------------------------------- | ||
|
||
if __name__ == '__main__': | ||
from utils import plot_clusters | ||
|
||
blobs_data, blobs_clusters = blobs(600, n_blobs=4, surplus=500) | ||
moons_data, moons_clusters = two_moons(600) | ||
point_circle_data, point_circle_clusters = point_and_circle(600) | ||
worst_blobs_data, worst_blobs_clusters = worst_case_blob(600, 5.0) | ||
|
||
print(blobs_data.shape) | ||
# print((blobs_clusters == 0).sum()) | ||
# print((blobs_clusters == 1).sum()) | ||
# print((blobs_clusters == 2).sum()) | ||
# print((blobs_clusters == 3).sum()) | ||
|
||
plot_clusters(blobs_data, blobs_clusters, 'blobs', show=True) | ||
plot_clusters(moons_data, moons_clusters, 'moons', show=False) | ||
plot_clusters(point_circle_data, point_circle_clusters, 'point and circle', show=False) | ||
plot_clusters(worst_blobs_data, worst_blobs_clusters, 'worst case blob', show=True) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
from skimage import io | ||
import numpy as np | ||
import matplotlib.pyplot as plt | ||
import os | ||
from utils import * | ||
from build_similarity_graph import build_similarity_graph | ||
from spectral_clustering import build_laplacian, spectral_clustering | ||
|
||
|
||
def image_segmentation(input_img='four_elements.bmp'): | ||
""" | ||
TO BE COMPLETED | ||
Function to perform image segmentation. | ||
:param input_img: name of the image file in /data (e.g. 'four_elements.bmp') | ||
""" | ||
filename = os.path.join('data', input_img) | ||
|
||
X = io.imread(filename) | ||
X = (X - np.min(X)) / (np.max(X) - np.min(X)) | ||
|
||
im_side = np.size(X, 1) | ||
Xr = X.reshape(im_side ** 2, 3) | ||
""" | ||
Y_rec should contain an index from 0 to c-1 where c is the | ||
number of segments you want to split the image into | ||
""" | ||
|
||
""" | ||
Choose parameters | ||
""" | ||
var = 1.0 | ||
k = 2 | ||
laplacian_normalization = 'unn' | ||
chosen_eig_indices = [1] | ||
num_classes = 1 | ||
|
||
W = build_similarity_graph(X, var=var, k=k) | ||
L = build_laplacian(W, laplacian_normalization) | ||
Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes) | ||
|
||
plt.figure() | ||
|
||
plt.subplot(1, 2, 1) | ||
plt.imshow(X) | ||
|
||
plt.subplot(1, 2, 2) | ||
Y_rec = Y_rec.reshape(im_side, im_side) | ||
plt.imshow(Y_rec) | ||
|
||
plt.show() | ||
|
||
|
||
if __name__ == '__main__': | ||
image_segmentation() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
numpy | ||
matplotlib | ||
networkx | ||
scipy | ||
scikit-learn | ||
scikit-image |
Oops, something went wrong.