Skip to content

Commit

Permalink
Merge pull request #26 from scottgigante/develop
Browse files Browse the repository at this point in the history
Add array_emd function
  • Loading branch information
wmayner authored Jan 22, 2018
2 parents 67accba + da5675e commit 56ea285
Show file tree
Hide file tree
Showing 4 changed files with 169 additions and 9 deletions.
11 changes: 10 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ Usage
>>> import numpy as np
>>> first_histogram = np.array([0.0, 1.0])
>>> second_histogram = np.array([5.0, 3.0])
>>> distance_matrix = np.array([[0.0, 0.5],
>>> distance_matrix = np.array([[0.0, 0.5],
... [0.5, 0.0]])
>>> emd(first_histogram, second_histogram, distance_matrix)
3.5
Expand All @@ -51,6 +51,15 @@ You can also get the associated minimum-cost flow:
>>> emd_with_flow(first_histogram, second_histogram, distance_matrix)
(3.5, [[0.0, 0.0], [0.0, 1.0]])
You can also calculate the EMD directly from two arrays of observations:

.. code:: python
>>> from pyemd import emd_samples
>>> first_array = [1,2,3,4]
>>> second_array = [2,3,4,5]
>>> emd_samples(first_array, second_array, bins=2)
0.5
API
~~~
Expand Down
11 changes: 9 additions & 2 deletions pyemd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@
>>> emd_with_flow(first_signature, second_signature, distance_matrix)
(3.5, [[0.0, 0.0], [0.0, 1.0]])
You can also calculate the EMD directly from two arrays of observations:
>>> from pyemd import emd_samples
>>> first_array = [1,2,3,4]
>>> second_array = [2,3,4,5]
>>> emd_samples(first_array, second_array, bins=2)
0.5
Limitations and Caveats
~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -64,5 +72,4 @@
"""

from .__about__ import *
from .emd import emd
from .emd import emd_with_flow
from .emd import emd, emd_with_flow, emd_samples
96 changes: 91 additions & 5 deletions pyemd/emd.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ cimport numpy as np
cdef extern from "lib/emd_hat.hpp":

cdef double \
emd_hat_gd_metric_double(vector[double],
emd_hat_gd_metric_double(vector[double],
vector[double],
vector[vector[double]],
vector[vector[double]],
double) except +

cdef pair[double, vector[vector[double]]] \
emd_hat_gd_metric_double_with_flow_wrapper(vector[double],
emd_hat_gd_metric_double_with_flow_wrapper(vector[double],
vector[double],
vector[vector[double]],
double) except +
Expand All @@ -46,6 +46,12 @@ def validate(first_histogram, second_histogram, distance_matrix):
raise ValueError('Histogram lengths must be equal')


def euclidean_pairwise_distance(x):
"""Calculate the euclidean pairwise distance matrix for a 1D array"""
distance = np.abs(np.repeat(x, len(x)) - np.tile(x, len(x)))
return distance.reshape(len(x), len(x))


def emd(np.ndarray[np.float64_t, ndim=1, mode="c"] first_histogram,
np.ndarray[np.float64_t, ndim=1, mode="c"] second_histogram,
np.ndarray[np.float64_t, ndim=2, mode="c"] distance_matrix,
Expand Down Expand Up @@ -86,12 +92,92 @@ def emd(np.ndarray[np.float64_t, ndim=1, mode="c"] first_histogram,
aren't the same length.
"""
validate(first_histogram, second_histogram, distance_matrix)
return emd_hat_gd_metric_double(first_histogram,
return emd_hat_gd_metric_double(first_histogram,
second_histogram,
distance_matrix,
distance_matrix,
extra_mass_penalty)


def emd_samples(first_array,
second_array,
extra_mass_penalty=DEFAULT_EXTRA_MASS_PENALTY,
distance='euclidean',
normalized=True,
bins='auto',
range=None):
u"""Return the EMD between the histograms of two arrays.
Arguments:
first_array (np.ndarray): A 1-dimensional array of type np.float64.
second_array (np.ndarray): A 1-dimensional array of type np.float64.
Keyword Arguments:
extra_mass_penalty: The penalty for extra mass. If you want the
resulting distance to be a metric, it should be at least half the
diameter of the space (maximum possible distance between any two
points). If you want partial matching you can set it to zero (but then
the resulting distance is not guaranteed to be a metric). The default
value is -1, which means the maximum value in the distance matrix is
used.
bins (int or string): The number of bins to include in the generated
histogram. If a string, must be one of the bin selection algorithms
accepted by `np.histogram`. Defaults to 'auto', which gives the
maximum of the ‘sturges’ and ‘fd’ estimators.
distance (string or function): A string or function implementing a
metric on a 1D np.ndarray. Default to euclidean distance.
Currently limited to 'euclidean' or your own function which takes
for input a 1D array and returns a square 2D array of pairwise
distances.
normalized (boolean): If true, treat histograms as fractions of the
dataset. If false, treat histograms as counts. In the latter case
the EMD will vary greatly by array length.
range (int min, int max): A tuple of minimum and maximum values for
histogram. Defaults to the range of the union of `first_array`
and `second_array`. Note: if the given range is not a superset
of the default range, no warning will be given.
Returns:
float: The EMD value.
"""
if range is None:
range = (min(np.min(first_array), np.min(second_array)),
max(np.max(first_array), np.max(second_array)))

if isinstance(bins, str):
hist, _ = np.histogram(np.concatenate([first_array,
second_array]),
range=range,
bins=bins)
bins = len(hist)

if distance == 'euclidean':
distance = euclidean_pairwise_distance

# Compute histograms
first_histogram, bin_edges = np.histogram(first_array,
range=range,
bins=bins)
second_histogram, _ = np.histogram(second_array,
range=range,
bins=bins)

first_histogram = first_histogram.astype(np.float64)
second_histogram = second_histogram.astype(np.float64)
if normalized:
# Normalize histograms to represent fraction of dataset in each bin
first_histogram = first_histogram/np.sum(first_histogram)
second_histogram = second_histogram/np.sum(second_histogram)

# Compute the distance matrix between the center of each bin
bin_locations = np.mean([bin_edges[:-1], bin_edges[1:]], axis=0)
distance_matrix = distance(bin_locations)

return emd(first_histogram,
second_histogram,
distance_matrix,
extra_mass_penalty=extra_mass_penalty)


def emd_with_flow(np.ndarray[np.float64_t, ndim=1, mode="c"] first_histogram,
np.ndarray[np.float64_t, ndim=1, mode="c"] second_histogram,
np.ndarray[np.float64_t, ndim=2, mode="c"] distance_matrix,
Expand Down
60 changes: 59 additions & 1 deletion test/test_pyemd.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import numpy as np
import pytest

from pyemd import emd, emd_with_flow
from pyemd import emd, emd_with_flow, emd_samples


EMD_PRECISION = 5
Expand Down Expand Up @@ -146,6 +146,64 @@ def test_extra_mass_penalty_flow():
[0.0, 0.0, 1.0, 0.0],
[0.0, 0.0, 1.0, 1.0]])


def test_case_samples():
first_array = [1,2,3,4]
second_array = [2,3,4,5]
emd_value = emd_samples(first_array, second_array)
assert round(emd_value, EMD_PRECISION) == 0.75


def test_case_samples_binsize():
first_array = [1,2,3,4]
second_array = [2,3,4,5]
emd_value = emd_samples(first_array, second_array, bins=2)
assert round(emd_value, EMD_PRECISION) == 0.5


def test_case_samples_manual_range():
first_array = [1,2,3,4]
second_array = [2,3,4,5]
emd_value = emd_samples(first_array, second_array, range=(0,10))
assert round(emd_value, EMD_PRECISION) == 1.0


def test_case_samples_not_normalized():
first_array = [1,2,3,4]
second_array = [2,3,4,5]
emd_value = emd_samples(first_array, second_array, normalized=False)
assert round(emd_value, EMD_PRECISION) == 3.0


def test_case_samples_custom_distance():
dist = lambda x : np.array([[0. if i == j else 1. for i in x] for j in x])
first_array = [1,2,3,4]
second_array = [2,3,4,5]
emd_value = emd_samples(first_array, second_array, distance=dist)
assert round(emd_value, EMD_PRECISION) == 0.25


def test_case_samples_2():
first_array = [1]
second_array = [2]
emd_value = emd_samples(first_array, second_array)
assert round(emd_value, EMD_PRECISION) == 0.5


def test_case_samples_3():
first_array = [1,1,1,2,3]
second_array = [1,2,2,2,3]
emd_value = emd_samples(first_array, second_array)
assert round(emd_value, EMD_PRECISION) == 0.32


def test_case_samples_4():
first_array = [1,2,3,4,5]
second_array = [99,98,97,96,95]
emd_value = emd_samples(first_array, second_array)
assert round(emd_value, EMD_PRECISION) == 78.4


# Validation testing
# ~~~~~~~~~~~~~~~~~~

Expand Down

0 comments on commit 56ea285

Please sign in to comment.