Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement pairwise distance calculation #306

Merged
merged 20 commits into from
Oct 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added sgkit/distance/__init__.py
Empty file.
102 changes: 102 additions & 0 deletions sgkit/distance/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import dask.array as da
import numpy as np

from sgkit.distance import metrics
from sgkit.typing import ArrayLike


def pairwise_distance(
x: ArrayLike,
metric: str = "euclidean",
) -> np.ndarray:
"""Calculates the pairwise distance between all pairs of vectors in the
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"""Calculates the pairwise distance between all pairs of vectors in the
"""Calculates the pairwise distance between all pairs of row vectors in the

Just to help make obvious this is computing distance between rows. (And thus when computing distance between samples, user will have to transpose input.)

given two dimensional array x.

To illustrate the algorithm consider the following (4, 5) two dimensional array:

[e.00, e.01, e.02, e.03, e.04]
[e.10, e.11, e.12, e.13, e.14]
[e.20, e.21, e.22, e.23, e.24]
[e.30, e.31, e.32, e.33, e.34]

The rows of the above matrix are the set of vectors. Now let's label all
the vectors as v0, v1, v2, v3.

The result will be a two dimensional symmetric matrix which will contain
the distance between all pairs. Since there are 4 vectors, calculating the
distance between each vector and every other vector, will result in 16
distances and the resultant array will be of size (4, 4) as follows:

[v0.v0, v0.v1, v0.v2, v0.v3]
[v1.v0, v1.v1, v1.v2, v1.v3]
[v2.v0, v2.v1, v2.v2, v2.v3]
[v3.v0, v3.v1, v3.v2, v3.v3]

The (i, j) position in the resulting array (matrix) denotes the distance
between vi and vj vectors.

Negative and nan values are considered as missing values. They are ignored
for all distance metric calculations.

Parameters
----------
x
[array-like, shape: (M, N)]
An array like two dimensional matrix. The rows are the
vectors used for comparison, i.e. for pairwise distance.
metric
The distance metric to use. The distance function can be
'euclidean' or 'correlation'.

Returns
-------

[array-like, shape: (M, N)]
A two dimensional distance matrix, which will be symmetric. The dimension
will be (M, N). The (i, j) position in the resulting array
(matrix) denotes the distance between ith and jth vectors.
Comment on lines +54 to +57
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
[array-like, shape: (M, N)]
A two dimensional distance matrix, which will be symmetric. The dimension
will be (M, N). The (i, j) position in the resulting array
(matrix) denotes the distance between ith and jth vectors.
[array-like, shape: (M, M)]
A two dimensional distance matrix, which will be symmetric. The dimension
will be (M, M). The (i, j) position in the resulting array
(matrix) denotes the distance between ith and jth row vectors in the input array.


Examples
--------

>>> from sgkit.distance.api import pairwise_distance
>>> import dask.array as da
>>> x = da.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]]).rechunk(2, 2)
>>> pairwise_distance(x, metric='euclidean')
array([[0. , 2.44948974, 4.69041576],
[2.44948974, 0. , 5.47722558],
[4.69041576, 5.47722558, 0. ]])

>>> import numpy as np
>>> x = np.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]])
>>> pairwise_distance(x, metric='euclidean')
array([[0. , 2.44948974, 4.69041576],
[2.44948974, 0. , 5.47722558],
[4.69041576, 5.47722558, 0. ]])

>>> x = np.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]])
>>> pairwise_distance(x, metric='correlation')
aktech marked this conversation as resolved.
Show resolved Hide resolved
array([[1.11022302e-16, 2.62956526e-01, 2.82353505e-03],
[2.62956526e-01, 0.00000000e+00, 2.14285714e-01],
[2.82353505e-03, 2.14285714e-01, 0.00000000e+00]])
"""

try:
metric_ufunc = getattr(metrics, f"{metric}")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
metric_ufunc = getattr(metrics, f"{metric}")
metric_ufunc = getattr(metrics, metric)

except AttributeError:
raise NotImplementedError(f"Given metric: {metric} is not implemented.")

x = da.asarray(x)
x_distance = da.blockwise(
# Lambda wraps reshape for broadcast
lambda _x, _y: metric_ufunc(_x[:, None, :], _y),
"jk",
x,
"ji",
x,
"ki",
dtype="float64",
concatenate=True,
)
x_distance = da.triu(x_distance, 1) + da.triu(x_distance).T
return x_distance.compute()
122 changes: 122 additions & 0 deletions sgkit/distance/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""This module implements various distance metrics."""

import numpy as np
from numba import guvectorize

from sgkit.typing import ArrayLike


@guvectorize( # type: ignore
[
"void(float32[:], float32[:], float32[:])",
"void(float64[:], float64[:], float64[:])",
"void(int8[:], int8[:], float64[:])",
],
"(n),(n)->()",
)
def correlation(x: ArrayLike, y: ArrayLike, out: ArrayLike) -> None:
aktech marked this conversation as resolved.
Show resolved Hide resolved
"""Calculates the correlation between two vectors.

Parameters
----------
x
[array-like, shape: (M,)]
A vector
y
[array-like, shape: (M,)]
Another vector
out
The output array, which has the output of pearson correlation.

Returns
-------
A scalar representing the pearson correlation coefficient between two vectors x and y.

Examples
--------
>>> from sgkit.distance.metrics import correlation
>>> import dask.array as da
>>> import numpy as np
>>> x = da.array([4, 3, 2, 3], dtype='i1')
>>> y = da.array([5, 6, 7, 0], dtype='i1')
>>> correlation(x, y).compute()
1.2626128

>>> correlation(x, x).compute()
-1.1920929e-07
"""
m = x.shape[0]
valid_indices = np.zeros(m, dtype=np.float64)

for i in range(m):
if x[i] >= 0 and y[i] >= 0:
valid_indices[i] = 1

valid_shape = valid_indices.sum()
_x = np.zeros(int(valid_shape), dtype=x.dtype)
_y = np.zeros(int(valid_shape), dtype=y.dtype)

# Ignore missing values
valid_idx = 0
for i in range(valid_indices.shape[0]):
if valid_indices[i] > 0:
_x[valid_idx] = x[i]
_y[valid_idx] = y[i]
valid_idx += 1

cov = ((_x - _x.mean()) * (_y - _y.mean())).sum()
denom = (_x.std() * _y.std()) / _x.shape[0]

value = np.nan
if denom > 0:
value = 1.0 - (cov / (_x.std() * _y.std()) / _x.shape[0])
out[0] = value


@guvectorize( # type: ignore
[
"void(float32[:], float32[:], float32[:])",
"void(float64[:], float64[:], float64[:])",
"void(int8[:], int8[:], float64[:])",
],
"(n),(n)->()",
)
def euclidean(x: ArrayLike, y: ArrayLike, out: ArrayLike) -> None:
"""Calculates the euclidean distance between two vectors.

Parameters
----------
x
[array-like, shape: (M,)]
A vector
y
[array-like, shape: (M,)]
Another vector
out
The output scalar, which has the output of euclidean between two vectors.

Returns
-------
A scalar representing the euclidean distance between two vectors x and y.

Examples
--------
>>> from sgkit.distance.metrics import euclidean
>>> import dask.array as da
>>> import numpy as np
>>> x = da.array([4, 3, 2, 3], dtype='i1')
>>> y = da.array([5, 6, 7, 0], dtype='i1')
>>> euclidean(x, y).compute()
6.6332495807108

>>> euclidean(x, x).compute()
0.0

"""
square_sum = 0.0
m = x.shape[0]
# Ignore missing values
for i in range(m):
if x[i] >= 0 and y[i] >= 0:
square_sum += (x[i] - y[i]) ** 2
out[0] = np.sqrt(square_sum)
155 changes: 155 additions & 0 deletions sgkit/tests/test_distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import typing

import dask.array as da
import numpy as np
import pytest
from scipy.spatial.distance import ( # type: ignore
correlation,
euclidean,
pdist,
squareform,
)

from sgkit.distance.api import pairwise_distance
from sgkit.typing import ArrayLike


def get_vectors(
array_type: str = "da",
dtype: str = "i8",
size: typing.Tuple[int, int] = (100, 100),
chunk: typing.Tuple[int, int] = (20, 10),
) -> ArrayLike:
if array_type == "da":
rs = da.random.RandomState(0)
x = rs.randint(0, 3, size=size).astype(dtype).rechunk(chunk)
else:
x = np.random.rand(size[0], size[1]).astype(dtype)
return x


def create_distance_matrix(
x: ArrayLike, metric_func: typing.Callable[[ArrayLike, ArrayLike], np.float64]
) -> ArrayLike:
"""
Parameters
----------
x
[array-like, shape: (M, N)]
An array like two dimensional matrix. The rows are the
vectors used for comparison, i.e. for pairwise distance.
metric_func
metric function for the distance metric.

Returns
-------
A two dimensional distance matrix.

"""
m = x.shape[0]
distance_matrix = np.zeros((m, m), dtype=np.float64)
for i in range(x.shape[0]):
for j in range(x.shape[0]):
k = np.stack([x[i], x[j]])
k = k[:, k.min(axis=0) >= 0]
vi, vj = k[0], k[1]
try:
distance_matrix[i][j] = metric_func(vi, vj)
except RuntimeWarning:
# unable to calculate distance metric which
# which means array contains only one element or
# not possible to calculate distance metric
distance_matrix[i][j] = np.nan
return distance_matrix


@pytest.mark.parametrize(
"size, chunk",
[
((100, 100), (20, 10)),
((100, 100), (25, 10)),
((100, 100), (50, 10)),
],
)
def test_distance_correlation(
size: typing.Tuple[int, int], chunk: typing.Tuple[int, int]
) -> None:
x = get_vectors(size=size, chunk=chunk)
distance_matrix = pairwise_distance(x, metric="correlation")
distance_array = pdist(x, metric="correlation")
expected_matrix = squareform(distance_array)
np.testing.assert_almost_equal(distance_matrix, expected_matrix)


@pytest.mark.parametrize(
"size, chunk",
[
((100, 100), (20, 10)),
((100, 100), (25, 10)),
((100, 100), (50, 10)),
],
)
def test_distance_euclidean(
size: typing.Tuple[int, int], chunk: typing.Tuple[int, int]
) -> None:
x = get_vectors(size=size, chunk=chunk)
distance_matrix = pairwise_distance(x, metric="euclidean")
expected_matrix = squareform(pdist(x))
np.testing.assert_almost_equal(distance_matrix, expected_matrix)


def test_distance_ndarray() -> None:
x = get_vectors(array_type="np")
distance_matrix = pairwise_distance(x, metric="euclidean")
expected_matrix = squareform(pdist(x))
np.testing.assert_almost_equal(distance_matrix, expected_matrix)


@pytest.mark.parametrize(
"metric, metric_func, dtype",
[
("euclidean", euclidean, "f8"),
("euclidean", euclidean, "i8"),
("correlation", correlation, "f8"),
("correlation", correlation, "i8"),
],
)
def test_missing_values(
metric: str,
metric_func: typing.Callable[[ArrayLike, ArrayLike], np.float64],
dtype: str,
) -> None:
x = get_vectors(array_type="np", dtype=dtype)

ri_times = np.random.randint(5, 20)
m, n = x.shape
for i in range(ri_times):
if dtype == "f8":
x[np.random.randint(0, m)][np.random.randint(0, m)] = np.nan
x[np.random.randint(0, m)][np.random.randint(0, m)] = np.random.randint(
-100, -1
)

distance_matrix = pairwise_distance(x, metric=metric)
expected_matrix = create_distance_matrix(x, metric_func)
np.testing.assert_almost_equal(distance_matrix, expected_matrix)


@pytest.mark.parametrize(
"dtype, expected",
[
("i8", "float64"),
("f4", "float32"),
("f8", "float64"),
],
)
def test_data_types(dtype, expected):
x = get_vectors(dtype=dtype)
distance_matrix = pairwise_distance(x)
assert distance_matrix.dtype.name == expected


def test_undefined_metric() -> None:
x = get_vectors(array_type="np")
with pytest.raises(NotImplementedError):
pairwise_distance(x, metric="not-implemented-metric")