sgkit-dev · mergify · Oct 22, 2020 · Oct 5, 2020 · Oct 9, 2020 · Oct 14, 2020
diff --git a/sgkit/distance/__init__.py b/sgkit/distance/__init__.py
diff --git a/sgkit/distance/api.py b/sgkit/distance/api.py
@@ -0,0 +1,102 @@
+import dask.array as da
+import numpy as np
+
+from sgkit.distance import metrics
+from sgkit.typing import ArrayLike
+
+
+def pairwise_distance(
+    x: ArrayLike,
+    metric: str = "euclidean",
+) -> np.ndarray:
+    """Calculates the pairwise distance between all pairs of vectors in the
-    """Calculates the pairwise distance between all pairs of vectors in the
+    """Calculates the pairwise distance between all pairs of row vectors in the
-    """Calculates the pairwise distance between all pairs of vectors in the
+    """Calculates the pairwise distance between all pairs of row vectors in the
+    given two dimensional array x.
+
+    To illustrate the algorithm consider the following (4, 5) two dimensional array:
+
+    [e.00, e.01, e.02, e.03, e.04]
+    [e.10, e.11, e.12, e.13, e.14]
+    [e.20, e.21, e.22, e.23, e.24]
+    [e.30, e.31, e.32, e.33, e.34]
+
+    The rows of the above matrix are the set of vectors. Now let's label all
+    the vectors as v0, v1, v2, v3.
+
+    The result will be a two dimensional symmetric matrix which will contain
+    the distance between all pairs. Since there are 4 vectors, calculating the
+    distance between each vector and every other vector, will result in 16
+    distances and the resultant array will be of size (4, 4) as follows:
+
+    [v0.v0, v0.v1, v0.v2, v0.v3]
+    [v1.v0, v1.v1, v1.v2, v1.v3]
+    [v2.v0, v2.v1, v2.v2, v2.v3]
+    [v3.v0, v3.v1, v3.v2, v3.v3]
+
+    The (i, j) position in the resulting array (matrix) denotes the distance
+    between vi and vj vectors.
+
+    Negative and nan values are considered as missing values. They are ignored
+    for all distance metric calculations.
+
+    Parameters
+    ----------
+    x
+        [array-like, shape: (M, N)]
+        An array like two dimensional matrix. The rows are the
+        vectors used for comparison, i.e. for pairwise distance.
+    metric
+        The distance metric to use. The distance function can be
+        'euclidean' or 'correlation'.
+
+    Returns
+    -------
+
+    [array-like, shape: (M, N)]
+    A two dimensional distance matrix, which will be symmetric. The dimension
+    will be (M, N). The (i, j) position in the resulting array
+    (matrix) denotes the distance between ith and jth vectors.
-    [array-like, shape: (M, N)]
-    A two dimensional distance matrix, which will be symmetric. The dimension
-    will be (M, N). The (i, j) position in the resulting array
-    (matrix) denotes the distance between ith and jth vectors.
+    [array-like, shape: (M, M)]
+    A two dimensional distance matrix, which will be symmetric. The dimension
+    will be (M, M). The (i, j) position in the resulting array
+    (matrix) denotes the distance between ith and jth row vectors in the input array.
-    [array-like, shape: (M, N)]
-    A two dimensional distance matrix, which will be symmetric. The dimension
-    will be (M, N). The (i, j) position in the resulting array
-    (matrix) denotes the distance between ith and jth vectors.
+    [array-like, shape: (M, M)]
+    A two dimensional distance matrix, which will be symmetric. The dimension
+    will be (M, M). The (i, j) position in the resulting array
+    (matrix) denotes the distance between ith and jth row vectors in the input array.
+
+    Examples
+    --------
+
+    >>> from sgkit.distance.api import pairwise_distance
+    >>> import dask.array as da
+    >>> x = da.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]]).rechunk(2, 2)
+    >>> pairwise_distance(x, metric='euclidean')
+    array([[0.        , 2.44948974, 4.69041576],
+           [2.44948974, 0.        , 5.47722558],
+           [4.69041576, 5.47722558, 0.        ]])
+
+    >>> import numpy as np
+    >>> x = np.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]])
+    >>> pairwise_distance(x, metric='euclidean')
+    array([[0.        , 2.44948974, 4.69041576],
+           [2.44948974, 0.        , 5.47722558],
+           [4.69041576, 5.47722558, 0.        ]])
+
+    >>> x = np.array([[6, 4, 1,], [4, 5, 2], [9, 7, 3]])
+    >>> pairwise_distance(x, metric='correlation')
+    array([[1.11022302e-16, 2.62956526e-01, 2.82353505e-03],
+           [2.62956526e-01, 0.00000000e+00, 2.14285714e-01],
+           [2.82353505e-03, 2.14285714e-01, 0.00000000e+00]])
+    """
+
+    try:
+        metric_ufunc = getattr(metrics, f"{metric}")
-        metric_ufunc = getattr(metrics, f"{metric}")
+        metric_ufunc = getattr(metrics, metric)
-        metric_ufunc = getattr(metrics, f"{metric}")
+        metric_ufunc = getattr(metrics, metric)
+    except AttributeError:
+        raise NotImplementedError(f"Given metric: {metric} is not implemented.")
+
+    x = da.asarray(x)
+    x_distance = da.blockwise(
+        # Lambda wraps reshape for broadcast
+        lambda _x, _y: metric_ufunc(_x[:, None, :], _y),
+        "jk",
+        x,
+        "ji",
+        x,
+        "ki",
+        dtype="float64",
+        concatenate=True,
+    )
+    x_distance = da.triu(x_distance, 1) + da.triu(x_distance).T
+    return x_distance.compute()
diff --git a/sgkit/distance/metrics.py b/sgkit/distance/metrics.py
@@ -0,0 +1,122 @@
+"""This module implements various distance metrics."""
+
+import numpy as np
+from numba import guvectorize
+
+from sgkit.typing import ArrayLike
+
+
+@guvectorize(  # type: ignore
+    [
+        "void(float32[:], float32[:], float32[:])",
+        "void(float64[:], float64[:], float64[:])",
+        "void(int8[:], int8[:], float64[:])",
+    ],
+    "(n),(n)->()",
+)
+def correlation(x: ArrayLike, y: ArrayLike, out: ArrayLike) -> None:
+    """Calculates the correlation between two vectors.
+
+    Parameters
+    ----------
+    x
+        [array-like, shape: (M,)]
+        A vector
+    y
+        [array-like, shape: (M,)]
+        Another vector
+    out
+        The output array, which has the output of pearson correlation.
+
+    Returns
+    -------
+    A scalar representing the pearson correlation coefficient between two vectors x and y.
+
+    Examples
+    --------
+    >>> from sgkit.distance.metrics import correlation
+    >>> import dask.array as da
+    >>> import numpy as np
+    >>> x = da.array([4, 3, 2, 3], dtype='i1')
+    >>> y = da.array([5, 6, 7, 0], dtype='i1')
+    >>> correlation(x, y).compute()
+    1.2626128
+
+    >>> correlation(x, x).compute()
+    -1.1920929e-07
+    """
+    m = x.shape[0]
+    valid_indices = np.zeros(m, dtype=np.float64)
+
+    for i in range(m):
+        if x[i] >= 0 and y[i] >= 0:
+            valid_indices[i] = 1
+
+    valid_shape = valid_indices.sum()
+    _x = np.zeros(int(valid_shape), dtype=x.dtype)
+    _y = np.zeros(int(valid_shape), dtype=y.dtype)
+
+    # Ignore missing values
+    valid_idx = 0
+    for i in range(valid_indices.shape[0]):
+        if valid_indices[i] > 0:
+            _x[valid_idx] = x[i]
+            _y[valid_idx] = y[i]
+            valid_idx += 1
+
+    cov = ((_x - _x.mean()) * (_y - _y.mean())).sum()
+    denom = (_x.std() * _y.std()) / _x.shape[0]
+
+    value = np.nan
+    if denom > 0:
+        value = 1.0 - (cov / (_x.std() * _y.std()) / _x.shape[0])
+    out[0] = value
+
+
+@guvectorize(  # type: ignore
+    [
+        "void(float32[:], float32[:], float32[:])",
+        "void(float64[:], float64[:], float64[:])",
+        "void(int8[:], int8[:], float64[:])",
+    ],
+    "(n),(n)->()",
+)
+def euclidean(x: ArrayLike, y: ArrayLike, out: ArrayLike) -> None:
+    """Calculates the euclidean distance between two vectors.
+
+    Parameters
+    ----------
+    x
+        [array-like, shape: (M,)]
+        A vector
+    y
+        [array-like, shape: (M,)]
+        Another vector
+    out
+        The output scalar, which has the output of euclidean between two vectors.
+
+    Returns
+    -------
+    A scalar representing the euclidean distance between two vectors x and y.
+
+    Examples
+    --------
+    >>> from sgkit.distance.metrics import euclidean
+    >>> import dask.array as da
+    >>> import numpy as np
+    >>> x = da.array([4, 3, 2, 3], dtype='i1')
+    >>> y = da.array([5, 6, 7, 0], dtype='i1')
+    >>> euclidean(x, y).compute()
+    6.6332495807108
+
+    >>> euclidean(x, x).compute()
+    0.0
+
+    """
+    square_sum = 0.0
+    m = x.shape[0]
+    # Ignore missing values
+    for i in range(m):
+        if x[i] >= 0 and y[i] >= 0:
+            square_sum += (x[i] - y[i]) ** 2
+    out[0] = np.sqrt(square_sum)
diff --git a/sgkit/tests/test_distance.py b/sgkit/tests/test_distance.py
@@ -0,0 +1,155 @@
+import typing
+
+import dask.array as da
+import numpy as np
+import pytest
+from scipy.spatial.distance import (  # type: ignore
+    correlation,
+    euclidean,
+    pdist,
+    squareform,
+)
+
+from sgkit.distance.api import pairwise_distance
+from sgkit.typing import ArrayLike
+
+
+def get_vectors(
+    array_type: str = "da",
+    dtype: str = "i8",
+    size: typing.Tuple[int, int] = (100, 100),
+    chunk: typing.Tuple[int, int] = (20, 10),
+) -> ArrayLike:
+    if array_type == "da":
+        rs = da.random.RandomState(0)
+        x = rs.randint(0, 3, size=size).astype(dtype).rechunk(chunk)
+    else:
+        x = np.random.rand(size[0], size[1]).astype(dtype)
+    return x
+
+
+def create_distance_matrix(
+    x: ArrayLike, metric_func: typing.Callable[[ArrayLike, ArrayLike], np.float64]
+) -> ArrayLike:
+    """
+    Parameters
+    ----------
+    x
+        [array-like, shape: (M, N)]
+        An array like two dimensional matrix. The rows are the
+        vectors used for comparison, i.e. for pairwise distance.
+    metric_func
+        metric function for the distance metric.
+
+    Returns
+    -------
+    A two dimensional distance matrix.
+
+    """
+    m = x.shape[0]
+    distance_matrix = np.zeros((m, m), dtype=np.float64)
+    for i in range(x.shape[0]):
+        for j in range(x.shape[0]):
+            k = np.stack([x[i], x[j]])
+            k = k[:, k.min(axis=0) >= 0]
+            vi, vj = k[0], k[1]
+            try:
+                distance_matrix[i][j] = metric_func(vi, vj)
+            except RuntimeWarning:
+                # unable to calculate distance metric which
+                # which means array contains only one element or
+                # not possible to calculate distance metric
+                distance_matrix[i][j] = np.nan
+    return distance_matrix
+
+
+@pytest.mark.parametrize(
+    "size, chunk",
+    [
+        ((100, 100), (20, 10)),
+        ((100, 100), (25, 10)),
+        ((100, 100), (50, 10)),
+    ],
+)
+def test_distance_correlation(
+    size: typing.Tuple[int, int], chunk: typing.Tuple[int, int]
+) -> None:
+    x = get_vectors(size=size, chunk=chunk)
+    distance_matrix = pairwise_distance(x, metric="correlation")
+    distance_array = pdist(x, metric="correlation")
+    expected_matrix = squareform(distance_array)
+    np.testing.assert_almost_equal(distance_matrix, expected_matrix)
+
+
+@pytest.mark.parametrize(
+    "size, chunk",
+    [
+        ((100, 100), (20, 10)),
+        ((100, 100), (25, 10)),
+        ((100, 100), (50, 10)),
+    ],
+)
+def test_distance_euclidean(
+    size: typing.Tuple[int, int], chunk: typing.Tuple[int, int]
+) -> None:
+    x = get_vectors(size=size, chunk=chunk)
+    distance_matrix = pairwise_distance(x, metric="euclidean")
+    expected_matrix = squareform(pdist(x))
+    np.testing.assert_almost_equal(distance_matrix, expected_matrix)
+
+
+def test_distance_ndarray() -> None:
+    x = get_vectors(array_type="np")
+    distance_matrix = pairwise_distance(x, metric="euclidean")
+    expected_matrix = squareform(pdist(x))
+    np.testing.assert_almost_equal(distance_matrix, expected_matrix)
+
+
+@pytest.mark.parametrize(
+    "metric, metric_func, dtype",
+    [
+        ("euclidean", euclidean, "f8"),
+        ("euclidean", euclidean, "i8"),
+        ("correlation", correlation, "f8"),
+        ("correlation", correlation, "i8"),
+    ],
+)
+def test_missing_values(
+    metric: str,
+    metric_func: typing.Callable[[ArrayLike, ArrayLike], np.float64],
+    dtype: str,
+) -> None:
+    x = get_vectors(array_type="np", dtype=dtype)
+
+    ri_times = np.random.randint(5, 20)
+    m, n = x.shape
+    for i in range(ri_times):
+        if dtype == "f8":
+            x[np.random.randint(0, m)][np.random.randint(0, m)] = np.nan
+        x[np.random.randint(0, m)][np.random.randint(0, m)] = np.random.randint(
+            -100, -1
+        )
+
+    distance_matrix = pairwise_distance(x, metric=metric)
+    expected_matrix = create_distance_matrix(x, metric_func)
+    np.testing.assert_almost_equal(distance_matrix, expected_matrix)
+
+
+@pytest.mark.parametrize(
+    "dtype, expected",
+    [
+        ("i8", "float64"),
+        ("f4", "float32"),
+        ("f8", "float64"),
+    ],
+)
+def test_data_types(dtype, expected):
+    x = get_vectors(dtype=dtype)
+    distance_matrix = pairwise_distance(x)
+    assert distance_matrix.dtype.name == expected
+
+
+def test_undefined_metric() -> None:
+    x = get_vectors(array_type="np")
+    with pytest.raises(NotImplementedError):
+        pairwise_distance(x, metric="not-implemented-metric")