diff --git a/sgkit/distance/__init__.py b/sgkit/distance/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/sgkit/distance/api.py b/sgkit/distance/api.py new file mode 100644 index 000000000..4f0343cc8 --- /dev/null +++ b/sgkit/distance/api.py @@ -0,0 +1,102 @@ +import dask.array as da +import numpy as np + +from sgkit.distance import metrics +from sgkit.typing import ArrayLike + + +def pairwise_distance( + x: ArrayLike, + metric: str = "euclidean", +) -> np.ndarray: + """Calculates the pairwise distance between all pairs of vectors in the + given two dimensional array x. + + To illustrate the algorithm consider the following (4, 5) two dimensional array: + + [e.00, e.01, e.02, e.03, e.04] + [e.10, e.11, e.12, e.13, e.14] + [e.20, e.21, e.22, e.23, e.24] + [e.30, e.31, e.32, e.33, e.34] + + The rows of the above matrix are the set of vectors. Now let's label all + the vectors as v0, v1, v2, v3. + + The result will be a two dimensional symmetric matrix which will contain + the distance between all pairs. Since there are 4 vectors, calculating the + distance between each vector and every other vector, will result in 16 + distances and the resultant array will be of size (4, 4) as follows: + + [v0.v0, v0.v1, v0.v2, v0.v3] + [v1.v0, v1.v1, v1.v2, v1.v3] + [v2.v0, v2.v1, v2.v2, v2.v3] + [v3.v0, v3.v1, v3.v2, v3.v3] + + The (i, j) position in the resulting array (matrix) denotes the distance + between vi and vj vectors. + + Negative and nan values are considered as missing values. They are ignored + for all distance metric calculations. + + Parameters + ---------- + x + [array-like, shape: (M, N)] + An array like two dimensional matrix. The rows are the + vectors used for comparison, i.e. for pairwise distance. + metric + The distance metric to use. The distance function can be + 'euclidean' or 'correlation'. + + Returns + ------- + + [array-like, shape: (M, N)] + A two dimensional distance matrix, which will be symmetric. The dimension + will be (M, N). The (i, j) position in the resulting array + (matrix) denotes the distance between ith and jth vectors. + + Examples + -------- + + >>> from sgkit.distance.api import pairwise_distance + >>> import dask.array as da + >>> x = da.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]]).rechunk(2, 2) + >>> pairwise_distance(x, metric='euclidean') + array([[0. , 2.44948974, 4.69041576], + [2.44948974, 0. , 5.47722558], + [4.69041576, 5.47722558, 0. ]]) + + >>> import numpy as np + >>> x = np.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]]) + >>> pairwise_distance(x, metric='euclidean') + array([[0. , 2.44948974, 4.69041576], + [2.44948974, 0. , 5.47722558], + [4.69041576, 5.47722558, 0. ]]) + + >>> x = np.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]]) + >>> pairwise_distance(x, metric='correlation') + array([[1.11022302e-16, 2.62956526e-01, 2.82353505e-03], + [2.62956526e-01, 0.00000000e+00, 2.14285714e-01], + [2.82353505e-03, 2.14285714e-01, 0.00000000e+00]]) + """ + + try: + metric_ufunc = getattr(metrics, f"{metric}") + except AttributeError: + raise NotImplementedError(f"Given metric: {metric} is not implemented.") + + x = da.asarray(x) + x_distance = da.blockwise( + # Lambda wraps reshape for broadcast + lambda _x, _y: metric_ufunc(_x[:, None, :], _y), + "jk", + x, + "ji", + x, + "ki", + dtype="float64", + concatenate=True, + ) + x_distance = da.triu(x_distance, 1) + da.triu(x_distance).T + return x_distance.compute() diff --git a/sgkit/distance/metrics.py b/sgkit/distance/metrics.py new file mode 100644 index 000000000..f247cdc0a --- /dev/null +++ b/sgkit/distance/metrics.py @@ -0,0 +1,122 @@ +"""This module implements various distance metrics.""" + +import numpy as np +from numba import guvectorize + +from sgkit.typing import ArrayLike + + +@guvectorize( # type: ignore + [ + "void(float32[:], float32[:], float32[:])", + "void(float64[:], float64[:], float64[:])", + "void(int8[:], int8[:], float64[:])", + ], + "(n),(n)->()", +) +def correlation(x: ArrayLike, y: ArrayLike, out: ArrayLike) -> None: + """Calculates the correlation between two vectors. + + Parameters + ---------- + x + [array-like, shape: (M,)] + A vector + y + [array-like, shape: (M,)] + Another vector + out + The output array, which has the output of pearson correlation. + + Returns + ------- + A scalar representing the pearson correlation coefficient between two vectors x and y. + + Examples + -------- + >>> from sgkit.distance.metrics import correlation + >>> import dask.array as da + >>> import numpy as np + >>> x = da.array([4, 3, 2, 3], dtype='i1') + >>> y = da.array([5, 6, 7, 0], dtype='i1') + >>> correlation(x, y).compute() + 1.2626128 + + >>> correlation(x, x).compute() + -1.1920929e-07 + """ + m = x.shape[0] + valid_indices = np.zeros(m, dtype=np.float64) + + for i in range(m): + if x[i] >= 0 and y[i] >= 0: + valid_indices[i] = 1 + + valid_shape = valid_indices.sum() + _x = np.zeros(int(valid_shape), dtype=x.dtype) + _y = np.zeros(int(valid_shape), dtype=y.dtype) + + # Ignore missing values + valid_idx = 0 + for i in range(valid_indices.shape[0]): + if valid_indices[i] > 0: + _x[valid_idx] = x[i] + _y[valid_idx] = y[i] + valid_idx += 1 + + cov = ((_x - _x.mean()) * (_y - _y.mean())).sum() + denom = (_x.std() * _y.std()) / _x.shape[0] + + value = np.nan + if denom > 0: + value = 1.0 - (cov / (_x.std() * _y.std()) / _x.shape[0]) + out[0] = value + + +@guvectorize( # type: ignore + [ + "void(float32[:], float32[:], float32[:])", + "void(float64[:], float64[:], float64[:])", + "void(int8[:], int8[:], float64[:])", + ], + "(n),(n)->()", +) +def euclidean(x: ArrayLike, y: ArrayLike, out: ArrayLike) -> None: + """Calculates the euclidean distance between two vectors. + + Parameters + ---------- + x + [array-like, shape: (M,)] + A vector + y + [array-like, shape: (M,)] + Another vector + out + The output scalar, which has the output of euclidean between two vectors. + + Returns + ------- + A scalar representing the euclidean distance between two vectors x and y. + + Examples + -------- + >>> from sgkit.distance.metrics import euclidean + >>> import dask.array as da + >>> import numpy as np + >>> x = da.array([4, 3, 2, 3], dtype='i1') + >>> y = da.array([5, 6, 7, 0], dtype='i1') + >>> euclidean(x, y).compute() + 6.6332495807108 + + >>> euclidean(x, x).compute() + 0.0 + + """ + square_sum = 0.0 + m = x.shape[0] + # Ignore missing values + for i in range(m): + if x[i] >= 0 and y[i] >= 0: + square_sum += (x[i] - y[i]) ** 2 + out[0] = np.sqrt(square_sum) diff --git a/sgkit/tests/test_distance.py b/sgkit/tests/test_distance.py new file mode 100644 index 000000000..3432b6421 --- /dev/null +++ b/sgkit/tests/test_distance.py @@ -0,0 +1,155 @@ +import typing + +import dask.array as da +import numpy as np +import pytest +from scipy.spatial.distance import ( # type: ignore + correlation, + euclidean, + pdist, + squareform, +) + +from sgkit.distance.api import pairwise_distance +from sgkit.typing import ArrayLike + + +def get_vectors( + array_type: str = "da", + dtype: str = "i8", + size: typing.Tuple[int, int] = (100, 100), + chunk: typing.Tuple[int, int] = (20, 10), +) -> ArrayLike: + if array_type == "da": + rs = da.random.RandomState(0) + x = rs.randint(0, 3, size=size).astype(dtype).rechunk(chunk) + else: + x = np.random.rand(size[0], size[1]).astype(dtype) + return x + + +def create_distance_matrix( + x: ArrayLike, metric_func: typing.Callable[[ArrayLike, ArrayLike], np.float64] +) -> ArrayLike: + """ + Parameters + ---------- + x + [array-like, shape: (M, N)] + An array like two dimensional matrix. The rows are the + vectors used for comparison, i.e. for pairwise distance. + metric_func + metric function for the distance metric. + + Returns + ------- + A two dimensional distance matrix. + + """ + m = x.shape[0] + distance_matrix = np.zeros((m, m), dtype=np.float64) + for i in range(x.shape[0]): + for j in range(x.shape[0]): + k = np.stack([x[i], x[j]]) + k = k[:, k.min(axis=0) >= 0] + vi, vj = k[0], k[1] + try: + distance_matrix[i][j] = metric_func(vi, vj) + except RuntimeWarning: + # unable to calculate distance metric which + # which means array contains only one element or + # not possible to calculate distance metric + distance_matrix[i][j] = np.nan + return distance_matrix + + +@pytest.mark.parametrize( + "size, chunk", + [ + ((100, 100), (20, 10)), + ((100, 100), (25, 10)), + ((100, 100), (50, 10)), + ], +) +def test_distance_correlation( + size: typing.Tuple[int, int], chunk: typing.Tuple[int, int] +) -> None: + x = get_vectors(size=size, chunk=chunk) + distance_matrix = pairwise_distance(x, metric="correlation") + distance_array = pdist(x, metric="correlation") + expected_matrix = squareform(distance_array) + np.testing.assert_almost_equal(distance_matrix, expected_matrix) + + +@pytest.mark.parametrize( + "size, chunk", + [ + ((100, 100), (20, 10)), + ((100, 100), (25, 10)), + ((100, 100), (50, 10)), + ], +) +def test_distance_euclidean( + size: typing.Tuple[int, int], chunk: typing.Tuple[int, int] +) -> None: + x = get_vectors(size=size, chunk=chunk) + distance_matrix = pairwise_distance(x, metric="euclidean") + expected_matrix = squareform(pdist(x)) + np.testing.assert_almost_equal(distance_matrix, expected_matrix) + + +def test_distance_ndarray() -> None: + x = get_vectors(array_type="np") + distance_matrix = pairwise_distance(x, metric="euclidean") + expected_matrix = squareform(pdist(x)) + np.testing.assert_almost_equal(distance_matrix, expected_matrix) + + +@pytest.mark.parametrize( + "metric, metric_func, dtype", + [ + ("euclidean", euclidean, "f8"), + ("euclidean", euclidean, "i8"), + ("correlation", correlation, "f8"), + ("correlation", correlation, "i8"), + ], +) +def test_missing_values( + metric: str, + metric_func: typing.Callable[[ArrayLike, ArrayLike], np.float64], + dtype: str, +) -> None: + x = get_vectors(array_type="np", dtype=dtype) + + ri_times = np.random.randint(5, 20) + m, n = x.shape + for i in range(ri_times): + if dtype == "f8": + x[np.random.randint(0, m)][np.random.randint(0, m)] = np.nan + x[np.random.randint(0, m)][np.random.randint(0, m)] = np.random.randint( + -100, -1 + ) + + distance_matrix = pairwise_distance(x, metric=metric) + expected_matrix = create_distance_matrix(x, metric_func) + np.testing.assert_almost_equal(distance_matrix, expected_matrix) + + +@pytest.mark.parametrize( + "dtype, expected", + [ + ("i8", "float64"), + ("f4", "float32"), + ("f8", "float64"), + ], +) +def test_data_types(dtype, expected): + x = get_vectors(dtype=dtype) + distance_matrix = pairwise_distance(x) + assert distance_matrix.dtype.name == expected + + +def test_undefined_metric() -> None: + x = get_vectors(array_type="np") + with pytest.raises(NotImplementedError): + pairwise_distance(x, metric="not-implemented-metric")