Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

draft: Cythonize #33

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions similaritymeasures/cy_similaritymeasures/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
'''
just a quick benchmaerking script to showcase cythonized vs
non cythonized performances
'''

import timeit
import numpy as np
from similaritymeasures import frechet_dist, dtw
from cy_similaritymeasures import frechet_dist as cy_frechet_dist
from cy_similaritymeasures import dtw as cy_dtw

#generating data in a way similar to the tests
x1 = np.linspace(0.0, 1.0, 100)
y1 = np.ones(100)*2
x2 = np.linspace(0.0, 1.0, 50)
y2 = np.ones(50)

np.random.seed(1212121)
curve_a_rand = np.random.random((100, 2))
curve_b_rand = np.random.random((90, 2))

curve1 = np.array((x1, y1)).T
curve2 = np.array((x2, y2)).T

res = frechet_dist(curve1, curve2)

cy_res = cy_frechet_dist(curve1, curve2)

print(res)
print(cy_res)
print(res == cy_res)

def run_frechet():
return frechet_dist(curve1, curve2)

def run_cy_frechet():
return cy_frechet_dist(curve1, curve2)

n_repeats = 5
n_runs = 200

t1 = timeit.repeat(run_frechet, repeat = n_repeats, number = n_runs)

t2 = timeit.repeat(run_cy_frechet, repeat = n_repeats, number = n_runs)
print(t1)
print(t2)

avg = sum(t1) / len(t1)
cy_avg = sum(t2) / len(t2)
improv_pct = (avg - cy_avg) / avg * 100
print('average execution time for non cythonized version: %f' % avg)
print('average execution time for cythonized version: %f' % cy_avg)
print('improvement: %d %%' % improv_pct)

#then i would benchmark dtw

res, _ = dtw(curve1, curve2)

cy_res, _ = cy_dtw(curve1, curve2)

print(res)
print(cy_res)
print(res == cy_res)

def run_dtw():
return dtw(curve1, curve2)

def run_cy_dtw():
return cy_dtw(curve1, curve2)

t1 = timeit.repeat(run_dtw, repeat = n_repeats, number = n_runs)

t2 = timeit.repeat(run_cy_dtw, repeat = n_repeats, number = n_runs)
print(t1)
print(t2)

avg = sum(t1) / len(t1)
cy_avg = sum(t2) / len(t2)
improv_pct = (avg - cy_avg) / avg * 100
print('average execution time for non cythonized version: %f' % avg)
print('average execution time for cythonized version: %f' % cy_avg)
print('improvement: %d %%' % improv_pct)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python3 setup.py build_ext --inplace
212 changes: 212 additions & 0 deletions similaritymeasures/cy_similaritymeasures/cy_similaritymeasures.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
import numpy as np
from scipy.spatial import distance

cdef double c_max(double a, double b):
if a > b:
return a
return b

cdef double c_min(double a, double b):
if a < b:
return a
return b

def frechet_dist(exp_data, num_data, p=2):
r"""
Compute the discrete Frechet distance

Compute the Discrete Frechet Distance between two N-D curves according to
[1]_. The Frechet distance has been defined as the walking dog problem.
From Wikipedia: "In mathematics, the Frechet distance is a measure of
similarity between curves that takes into account the location and
ordering of the points along the curves. It is named after Maurice Frechet.
https://en.wikipedia.org/wiki/Fr%C3%A9chet_distance

Parameters
----------
exp_data : array_like
Curve from your experimental data. exp_data is of (M, N) shape, where
M is the number of data points, and N is the number of dimmensions
num_data : array_like
Curve from your numerical data. num_data is of (P, N) shape, where P
is the number of data points, and N is the number of dimmensions
p : float, 1 <= p <= infinity
Which Minkowski p-norm to use. Default is p=2 (Eculidean).
The manhattan distance is p=1.

Returns
-------
df : float
discrete Frechet distance

References
----------
.. [1] Thomas Eiter and Heikki Mannila. Computing discrete Frechet
distance. Technical report, 1994.
http://www.kr.tuwien.ac.at/staff/eiter/et-archive/cdtr9464.pdf
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.937&rep=rep1&type=pdf

Notes
-----
Your x locations of data points should be exp_data[:, 0], and the y
locations of the data points should be exp_data[:, 1]. Same for num_data.

Thanks to Arbel Amir for the issue, and Sen ZHANG for the iterative code
https://github.com/cjekel/similarity_measures/issues/6

Examples
--------
>>> # Generate random experimental data
>>> x = np.random.random(100)
>>> y = np.random.random(100)
>>> exp_data = np.zeros((100, 2))
>>> exp_data[:, 0] = x
>>> exp_data[:, 1] = y
>>> # Generate random numerical data
>>> x = np.random.random(100)
>>> y = np.random.random(100)
>>> num_data = np.zeros((100, 2))
>>> num_data[:, 0] = x
>>> num_data[:, 1] = y
>>> df = frechet_dist(exp_data, num_data)

"""
cdef int i, j
cdef int n = len(exp_data)
cdef int m = len(num_data)
c = distance.cdist(exp_data, num_data, metric='minkowski', p=p)
#ca = np.ones((n, m))
#ca = np.multiply(ca, -1)
#ca[0, 0] = c[0, 0]
#for i in range(1, n):
# ca[i, 0] = max(ca[i-1, 0], c[i, 0])
#for j in range(1, m):
# ca[0, j] = max(ca[0, j-1], c[0, j])
for i in range(1, n):
for j in range(1, m):
c[i, j] = c_max(
c_min(c_min(c[i-1, j], c[i, j-1]) , c[i-1, j-1]),
c[i, j])
return c[n-1, m-1]

def dtw(exp_data, num_data, metric='euclidean', **kwargs):
r"""
Compute the Dynamic Time Warping distance.

This computes a generic Dynamic Time Warping (DTW) distance and follows
the algorithm from [1]_. This can use all distance metrics that are
available in scipy.spatial.distance.cdist.

Parameters
----------
exp_data : array_like
Curve from your experimental data. exp_data is of (M, N) shape, where
M is the number of data points, and N is the number of dimmensions
num_data : array_like
Curve from your numerical data. num_data is of (P, N) shape, where P
is the number of data points, and N is the number of dimmensions
metric : str or callable, optional
The distance metric to use. Default='euclidean'. Refer to the
documentation for scipy.spatial.distance.cdist. Some examples:
'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'kulsinski',
'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
'wminkowski', 'yule'.
**kwargs : dict, optional
Extra arguments to `metric`: refer to each metric documentation in
scipy.spatial.distance.
Some examples:

p : scalar
The p-norm to apply for Minkowski, weighted and unweighted.
Default: 2.

w : ndarray
The weight vector for metrics that support weights (e.g.,
Minkowski).

V : ndarray
The variance vector for standardized Euclidean.
Default: var(vstack([XA, XB]), axis=0, ddof=1)

VI : ndarray
The inverse of the covariance matrix for Mahalanobis.
Default: inv(cov(vstack([XA, XB].T))).T

out : ndarray
The output array
If not None, the distance matrix Y is stored in this array.

Returns
-------
r : float
DTW distance.
d : ndarray (2-D)
Cumulative distance matrix

Notes
-----
The DTW distance is d[-1, -1].

This has O(M, P) computational cost.

The latest scipy.spatial.distance.cdist information can be found at
https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html

Your x locations of data points should be exp_data[:, 0], and the y
locations of the data points should be exp_data[:, 1]. Same for num_data.

This uses the euclidean distance for now. In the future it should be
possible to support other metrics.

DTW is a non-metric distance, which means DTW doesn't hold the triangle
inequality.
https://en.wikipedia.org/wiki/Triangle_inequality

References
----------
.. [1] Senin, P., 2008. Dynamic time warping algorithm review. Information
and Computer Science Department University of Hawaii at Manoa Honolulu,
USA, 855, pp.1-23.
http://seninp.github.io/assets/pubs/senin_dtw_litreview_2008.pdf

Examples
--------
>>> # Generate random experimental data
>>> x = np.random.random(100)
>>> y = np.random.random(100)
>>> exp_data = np.zeros((100, 2))
>>> exp_data[:, 0] = x
>>> exp_data[:, 1] = y
>>> # Generate random numerical data
>>> x = np.random.random(100)
>>> y = np.random.random(100)
>>> num_data = np.zeros((100, 2))
>>> num_data[:, 0] = x
>>> num_data[:, 1] = y
>>> r, d = dtw(exp_data, num_data)

The euclidean distance is used by default. You can use metric and **kwargs
to specify different types of distance metrics. The following example uses
the city block or Manhattan distance between points.

>>> r, d = dtw(exp_data, num_data, metric='cityblock')

"""
cdef int i, j
cdef int n = len(exp_data)
cdef int m = len(num_data)
c = distance.cdist(exp_data, num_data, metric=metric, **kwargs)

d = np.zeros(c.shape)
d[0, 0] = c[0, 0]
#n, m = c.shape
for i in range(1, n):
d[i, 0] = d[i-1, 0] + c[i, 0]
for j in range(1, m):
d[0, j] = d[0, j-1] + c[0, j]
for i in range(1, n):
for j in range(1, m):
d[i, j] = c[i, j] + c_min(c_min(d[i-1, j], d[i, j-1]), d[i-1, j-1])
return d[-1, -1], d
26 changes: 26 additions & 0 deletions similaritymeasures/cy_similaritymeasures/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# cy_similaritymeasures

this folder shall contain stuff to cythonize the similaritymeasures functions

```
benchmark.py #shall be the script containg a quick performance check, to be executed at last after the cythonized version was correctly built
build_cython_extension.sh #to invoke the build from the setup.py file a command is required, and for linux such command is written in this file
cy_similaritymeasures.pyx #the cython code with the cythonized version of the frechet function
setup.py #a setup.py file used to build the cython code and obtain an extension from it
```

## extension building

cython code from the `*.pyx` file is not directly invokable from python, instead it should be built and transformed into an extension. To do that there is the script `build_cython_extension.sh`, which invokes `setup.py` to obtain a `.cpp` and `.so` extension file to be invoked from python.

You should have cython on your machine in order to build everything.

## results

to run the benchmark just run the `benchmark.py`. On my machine the results were:

```
average execution time for non cythonized version: 0.744819
average execution time for cythonized version: 0.433768
improvement: 41 %
```
17 changes: 17 additions & 0 deletions similaritymeasures/cy_similaritymeasures/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
'''
knowing this may be a bad thing, but i will place here a
quick setup.py to just
'''

from setuptools import setup, Extension
from Cython.Build import cythonize

setup(
name = "cy_frechet",
ext_modules = cythonize(Extension(
"cy_similaritymeasures",
sources=["cy_similaritymeasures.pyx"],
language="c++"
)
))