blue-yonder · nils-braun · Jan 25, 2021 · Jan 15, 2021 · Jan 16, 2021 · Jan 16, 2021
diff --git a/requirements.txt b/requirements.txt
@@ -8,3 +8,4 @@ scikit-learn>=0.19.2
 tqdm>=4.10.0
 dask[dataframe]>=2.9.0
 distributed>=2.11.0
+matrixprofile>=1.1.7,<2.0.0
diff --git a/tests/units/feature_extraction/test_feature_calculations.py b/tests/units/feature_extraction/test_feature_calculations.py
@@ -1301,6 +1301,31 @@ def test_benford_correlation(self):
         self.assertAlmostEqual(benford_correlation(list_with_nan), 0.10357511)
         self.assertIsNaN(benford_correlation(equal_list))
 
+    def test_matrix_profile_window(self):
+        # Test matrix profile output with specified window. NOTE(review): matrix_profile is declared as (x, param) in this patch - confirm this call style.
+        np.random.seed(9999)  # fixed seed, the asserted values below depend on it
+        ts = np.random.uniform(size=2**10)
+        w = 2**5
+        subq = ts[0:w]  # slice holding the first w samples of ts
+        ts[0:w] = subq  # writes the same samples back to where they came from
+        ts[w+100:w+100+w] = subq  # repeat the first w samples starting at index w+100
+        self.assertAlmostEqual(matrix_profile(ts,windows=36)[0],2.826)  # windows=36 differs from w (32) - TODO confirm intended
+        self.assertAlmostEqual(matrix_profile(ts,windows=36)[1],3.514)  # default assertAlmostEqual compares to 7 places - TODO confirm precision
+        self.assertAlmostEqual(matrix_profile(ts,windows=36)[2],3.626)
+
+    def test_matrix_profile_no_window(self):
+        np.random.seed(9999)  # fixed seed, the asserted values below depend on it
+        ts = np.random.uniform(size=2**10)
+        w = 2**5
+        subq = ts[0:w]  # slice holding the first w samples of ts
+        ts[0:w] = subq  # writes the same samples back to where they came from
+        ts[w+100:w+100+w] = subq  # repeat the first w samples starting at index w+100
+
+        # Test matrix profile output with no window specified. NOTE(review): matrix_profile is declared as (x, param) in this patch - confirm this call style.
+        self.assertAlmostEqual(matrix_profile(ts)[0],2.826)  # same expected values as the windowed test - TODO confirm
+        self.assertAlmostEqual(matrix_profile(ts)[1],3.514)
+        self.assertAlmostEqual(matrix_profile(ts)[2],3.626)
+
 
 class FriedrichTestCase(TestCase):
 

diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py
@@ -19,6 +19,7 @@
 
 import itertools
 import functools
+from tsfresh.utilities.string_manipulation import convert_to_output_format
 import warnings
 from builtins import range
 from collections import defaultdict
@@ -29,6 +30,7 @@
 from scipy.signal import cwt, find_peaks_cwt, ricker, welch
 from scipy.stats import linregress
 from statsmodels.tools.sm_exceptions import MissingDataError
+import matrixprofile as mp
 
 with warnings.catch_warnings():
     # Ignore warnings of the patsy package
@@ -2212,3 +2214,84 @@ def benford_correlation(x):
     # np.corrcoef outputs the normalized covariance (correlation) between benford_distribution and data_distribution.
     # In this case returns a 2x2 matrix, the  [0, 1] and [1, 1] are the values between the two arrays
     return np.corrcoef(benford_distribution, data_distribution)[0, 1]
+
+
+@set_property("fctype", "combiner")
+def matrix_profile(x, param):
+    """
+    Calculates summary statistics of the 1-D matrix profile of the time series x.
+
+    The matrix profile is computed with the matrixprofile package
+    (https://matrixprofile.docs.matrixprofile.org/api.html#matrixprofile-compute).
+    If a dictionary in param contains "windows" (a single window size), matrixprofile.compute
+    is used; otherwise the last profile of the pan matrix profile returned by
+    matrixprofile.algorithms.maximum_subsequence is taken. Only finite profile values
+    enter the statistics. If the profile cannot be computed or has no finite value,
+    the feature value is NaN.
+
+    :param x: the time series to calculate the feature of
+    :type x: numpy.ndarray
+    :param param: contains dictionaries {"sample_pct": x, "threshold": y, "feature": z}
+        with sample_pct and threshold being parameters of the matrixprofile package
+        and feature being one of "min", "max", "mean", "median", "25", "75"
+        and decides which feature of the matrix profile to extract.
+        The dictionaries are not modified.
+    :type param: list
+    :return: list of tuples (s, f) where s are the parameters, serialized as a string,
+        and f the respective feature value
+    :return type: list
+    :raises ValueError: if a feature other than the ones listed above is requested
+    """
+    x = np.asarray(x)
+
+    # Statistic applied to the finite profile values for every supported "feature"
+    statistics = {
+        "min": np.min,
+        "max": np.max,
+        "mean": np.mean,
+        "median": np.median,
+        "25": functools.partial(np.percentile, q=25),
+        "75": functools.partial(np.percentile, q=75),
+    }
+
+    def _calculate_mp(**kwargs):
+        """Calculate the matrix profile using the specified window, or the maximum subsequence if no window is specified"""
+        try:
+            if "windows" in kwargs:
+                # compute() returns a profile dictionary, the distances are stored under "mp"
+                m_p = mp.compute(x, **kwargs)["mp"]
+            else:
+                m_p = mp.algorithms.maximum_subsequence(x, include_pmp=True, **kwargs)["pmp"][-1]
+            return np.asarray(m_p, dtype=float)
+        except Exception:
+            # Deliberate best effort: a profile that cannot be computed yields NaN features
+            return np.array([np.nan])
+
+    # The already calculated matrix profiles
+    matrix_profiles = {}
+
+    # The results
+    res = {}
+
+    for settings in param:
+        # Work on a copy: popping "feature" from the caller's dictionary would
+        # break every later call that receives the same param list
+        kwargs = dict(settings)
+        key = convert_to_output_format(kwargs)
+        feature = kwargs.pop("feature")
+        if feature not in statistics:
+            raise ValueError(f"Unknown feature {feature} for the matrix profile")
+
+        # Only calculate the pmp if we have not already done so
+        # The feature calculation can happen afterwards
+        featureless_key = convert_to_output_format(kwargs)
+        if featureless_key not in matrix_profiles:
+            matrix_profiles[featureless_key] = _calculate_mp(**kwargs)
+
+        m_p = matrix_profiles[featureless_key]
+        finite_values = m_p[np.isfinite(m_p)]
+
+        # An empty selection (failed or all non-finite profile) has no statistic
+        res[key] = statistics[feature](finite_values) if finite_values.size else np.nan
+
+    return list(res.items())
diff --git a/tsfresh/feature_extraction/settings.py b/tsfresh/feature_extraction/settings.py
@@ -152,6 +152,7 @@ def __init__(self):
             "lempel_ziv_complexity": [{"bins": x} for x in [2, 3, 5, 10, 100]],
             "fourier_entropy":  [{"bins": x} for x in [2, 3, 5, 10, 100]],
             "permutation_entropy":  [{"tau": 1, "dimension": x} for x in [3, 4, 5, 6, 7]],
+            "matrix_profile": [{"threshold": 0.98, "feature": f} for f in ["min", "max", "mean", "median", "25", "75"]]
 
         })