Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addition of matrix_profile feature #793

Merged
merged 14 commits into from
Jan 25, 2021
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ scikit-learn>=0.19.2
tqdm>=4.10.0
dask[dataframe]>=2.9.0
distributed>=2.11.0
matrixprofile>=1.1.7,<2.0.0
25 changes: 25 additions & 0 deletions tests/units/feature_extraction/test_feature_calculations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1301,6 +1301,31 @@ def test_benford_correlation(self):
self.assertAlmostEqual(benford_correlation(list_with_nan), 0.10357511)
self.assertIsNaN(benford_correlation(equal_list))

def test_matrix_profile_window(self):
    """Compare entries 0-2 of the matrix_profile result for windows=36 with fixed reference values."""
    # Fixed seed so the uniform noise (and therefore the profile) is reproducible
    np.random.seed(9999)
    series = np.random.uniform(size=2**10)
    width = 2**5

    # Plant a copy of the leading subsequence 100 samples after its end
    motif = series[0:width]
    series[0:width] = motif
    series[width+100:width+100+width] = motif

    # NOTE(review): matrix_profile is called with a `windows` keyword here;
    # confirm this matches the signature of the calculator under test.
    reference_values = (2.826, 3.514, 3.626)
    for position, reference in enumerate(reference_values):
        self.assertAlmostEqual(matrix_profile(series, windows=36)[position], reference)

def test_matrix_profile_no_window(self):
    """Compare entries 0-2 of the matrix_profile result, without a window, with fixed reference values."""
    # Fixed seed so the uniform noise (and therefore the profile) is reproducible
    np.random.seed(9999)
    series = np.random.uniform(size=2**10)
    width = 2**5

    # Plant a copy of the leading subsequence 100 samples after its end
    motif = series[0:width]
    series[0:width] = motif
    series[width+100:width+100+width] = motif

    # NOTE(review): matrix_profile is called with the series only;
    # confirm this matches the signature of the calculator under test.
    reference_values = (2.826, 3.514, 3.626)
    for position, reference in enumerate(reference_values):
        self.assertAlmostEqual(matrix_profile(series)[position], reference)

nils-braun marked this conversation as resolved.
Show resolved Hide resolved

class FriedrichTestCase(TestCase):

Expand Down
69 changes: 69 additions & 0 deletions tsfresh/feature_extraction/feature_calculators.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import itertools
import functools
from tsfresh.utilities.string_manipulation import convert_to_output_format
import warnings
from builtins import range
from collections import defaultdict
Expand All @@ -29,6 +30,7 @@
from scipy.signal import cwt, find_peaks_cwt, ricker, welch
from scipy.stats import linregress
from statsmodels.tools.sm_exceptions import MissingDataError
import matrixprofile as mp

with warnings.catch_warnings():
# Ignore warnings of the patsy package
Expand Down Expand Up @@ -2212,3 +2214,70 @@ def benford_correlation(x):
# np.corrcoef outputs the normalized covariance (correlation) between benford_distribution and data_distribution.
# In this case returns a 2x2 matrix, the [0, 1] and [1, 1] are the values between the two arrays
return np.corrcoef(benford_distribution, data_distribution)[0, 1]


@set_property("fctype", "combiner")
def matrix_profile(x, param):
    """
    Calculate summary statistics of the 1-D matrix profile of the time series x.

    For every parameter dictionary in ``param`` the matrix profile is computed with the
    matrixprofile package
    (https://matrixprofile.docs.matrixprofile.org/api.html#matrixprofile-compute)
    and reduced to a single number selected by the "feature" entry. If "windows" is
    given, the profile is computed with ``matrixprofile.compute``; otherwise the last row
    of the pan matrix profile returned by ``maximum_subsequence`` is used. Profiles are
    cached per distinct set of computation parameters, so asking for several features of
    the same profile only computes it once. Non-finite profile entries are ignored; if
    the profile cannot be computed or has no finite entry the feature value is NaN.

    :param x: the time series to calculate the feature of
    :type x: numpy.ndarray
    :param param: contains dictionaries {"sample_pct": x, "threshold": y, "feature": z}
        with sample_pct and threshold being parameters of the matrixprofile
        package and feature being one of "min", "max", "mean", "median", "25", "75",
        which decides which feature of the matrix profile to extract.
        An optional "windows" entry is forwarded to the matrixprofile package as well.
    :type param: list
    :return: the different feature values as (name, value) pairs
    :return type: list
    :raise ValueError: if a "feature" entry is not one of the supported names
    :raise KeyError: if a parameter dictionary has no "feature" entry
    """
    x = np.asarray(x)

    # Maps each supported "feature" name to the reduction applied to the finite profile values
    reducers = {
        "min": np.min,
        "max": np.max,
        "mean": np.mean,
        "median": np.median,
        "25": lambda values: np.percentile(values, 25),
        "75": lambda values: np.percentile(values, 75),
    }

    def _calculate_mp(**kwargs):
        """Calculate the matrix profile using the specified window, or the maximum subsequence if no window is specified"""
        try:
            if "windows" in kwargs:
                profile = mp.compute(x, **kwargs)
                # compute() returns a profile dictionary, not the bare array. A single
                # window is expected under "mp"; presumably several windows give a pan
                # matrix profile under "pmp" (verify against the matrixprofile docs).
                m_p = profile["mp"] if "mp" in profile else profile["pmp"][-1]
            else:
                m_p = mp.algorithms.maximum_subsequence(x, include_pmp=True, **kwargs)["pmp"][-1]
            return np.asarray(m_p, dtype=float)
        except Exception:
            # Deliberately best effort: a series for which no profile can be computed
            # yields NaN features instead of aborting the whole extraction.
            return np.array([np.nan])

    # The already calculated matrix profiles
    matrix_profiles = {}

    # The results
    res = {}

    for kwargs in param:
        key = convert_to_output_format(kwargs)

        # Work on a copy: the dictionaries are the very objects stored in the caller's
        # settings, so popping from them directly would remove "feature" for later calls.
        kwargs = kwargs.copy()
        feature = kwargs.pop("feature")
        if feature not in reducers:
            raise ValueError(f"Unknown feature {feature} for the matrix profile")

        # Only calculate the profile if we have not already done so
        # The feature calculation can happen afterwards
        featureless_key = convert_to_output_format(kwargs)
        if featureless_key not in matrix_profiles:
            matrix_profiles[featureless_key] = _calculate_mp(**kwargs)

        m_p = matrix_profiles[featureless_key]
        finite_values = m_p[np.isfinite(m_p)]

        # The reductions raise on empty input, so report NaN when nothing finite is left
        res[key] = reducers[feature](finite_values) if finite_values.size else np.nan

    return list(res.items())
1 change: 1 addition & 0 deletions tsfresh/feature_extraction/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def __init__(self):
"lempel_ziv_complexity": [{"bins": x} for x in [2, 3, 5, 10, 100]],
"fourier_entropy": [{"bins": x} for x in [2, 3, 5, 10, 100]],
"permutation_entropy": [{"tau": 1, "dimension": x} for x in [3, 4, 5, 6, 7]],
"matrix_profile": [{"threshold": 0.98, "feature": f} for f in ["min", "max", "mean", "median", "25", "75"]]

})

Expand Down