Skip to content

Commit

Permalink
Implemented two new feature calculators: Lempel Ziv complexity and Fou…
Browse files Browse the repository at this point in the history
…rier Entropy (#688)

* Implemented a new feature calculator: Lempel Ziv complexity

* Added to changelog

* Added an additional parameter and fixed a typo

* pep8ify

* Implemented a new feature calculator: Fourier Entropy

* Added to changelog

Co-authored-by: Nils Braun <nilslennartbraun@gmail.com>
  • Loading branch information
nils-braun and nils-braun authored May 14, 2020
1 parent b73cd09 commit 8dd3f72
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 1 deletion.
5 changes: 5 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ Changelog

tsfresh uses `Semantic Versioning <http://semver.org/>`_

Unreleased
==========
- Added Features
- Implemented the Lempel-Ziv-Complexity and the Fourier Entropy (#688)

Version 0.16.0
==============

Expand Down
32 changes: 32 additions & 0 deletions tests/units/feature_extraction/test_feature_calculations.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,38 @@ def test_cid_ce(self):
self.assertEqualOnAllArrayTypes(cid_ce, [0.5, 3.5, 7.5], 5, normalize=False)
self.assertEqualOnAllArrayTypes(cid_ce, [-4.33, -1.33, 2.67], 5, normalize=False)

def test_lempel_ziv_complexity(self):
    """Check lempel_ziv_complexity against fixed reference values."""
    # Each entry: (time series, expected complexity, number of bins).
    reference_cases = [
        # constant series, independent of the number of bins
        ([1, 1, 1], 2./3, 2),
        ([1, 1, 1], 2./3, 5),
        # longer constant series and the same series with a single outlier
        ([1, 1, 1, 1, 1, 1, 1], 0.4285714285, 2),
        ([1, 1, 1, 2, 1, 1, 1], 0.5714285714, 2),
        # mixed float series, finely binned
        ([-1, 4.3, 5, 1, -4.5, 1, 5, 7, -3.4, 6], 0.8, 10),
        # same series with one value replaced by NaN
        ([-1, np.nan, 5, 1, -4.5, 1, 5, 7, -3.4, 6], 0.4, 10),
    ]

    for series, expected, number_of_bins in reference_cases:
        self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, series,
                                              expected, bins=number_of_bins)

def test_fourier_entropy(self):
    """Check fourier_entropy against fixed reference values."""
    # Each entry: (time series, expected entropy, number of bins).
    reference_cases = [
        # short series, independent of the number of bins
        ([1, 2, 1], 0.693147180, 2),
        ([1, 2, 1], 0.693147180, 5),
        # the position of the single outlier changes the result
        ([1, 1, 2, 1, 1, 1, 1], 0.5623351446188083, 5),
        ([1, 1, 1, 1, 2, 1, 1], 1.0397207708399179, 5),
        # mixed float series, finely binned
        ([-1, 4.3, 5, 1, -4.5, 1, 5, 7, -3.4, 6], 1.5607104090414063, 10),
    ]

    for series, expected, number_of_bins in reference_cases:
        self.assertAlmostEqualOnAllArrayTypes(fourier_entropy, series,
                                              expected, bins=number_of_bins)

    # A NaN anywhere in the series has to yield NaN as the feature value.
    series_with_nan = [-1, np.nan, 5, 1, -4.5, 1, 5, 7, -3.4, 6]
    self.assertIsNanOnAllArrayTypes(fourier_entropy, series_with_nan, bins=10)

def test_ratio_beyond_r_sigma(self):

x = [0, 1] * 10 + [10, 20, -30] # std of x is 7.21, mean 3.04
Expand Down
60 changes: 60 additions & 0 deletions tsfresh/feature_extraction/feature_calculators.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,61 @@ def cid_ce(x, normalize):
return np.sqrt(np.dot(x, x))


@set_property("fctype", "simple")
def fourier_entropy(x, bins):
    """
    Binned entropy of the normalized power spectral density of the time series.

    The power spectral density is estimated with the Welch method, using
    segments of at most 256 samples (the full series length if it is shorter).
    The spectrum is divided by its maximum and then passed to binned_entropy.

    Ref: https://hackaday.io/project/707-complexity-of-a-time-series/details
    Ref: https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.signal.welch.html

    :param x: the time series to calculate the feature of
    :param bins: the number of bins handed to binned_entropy
    :return: the value of this feature
    """
    segment_length = min(len(x), 256)
    # welch returns (frequencies, power spectral density); only the latter is used
    spectrum = welch(x, nperseg=segment_length)[1]
    scaled_spectrum = spectrum / np.max(spectrum)
    return binned_entropy(scaled_spectrum, bins)


@set_property("fctype", "simple")
def lempel_ziv_complexity(x, bins):
    """
    Calculate a complexity estimate based on the Lempel-Ziv compression
    algorithm.

    The complexity is defined as the number of dictionary entries (or sub-words) needed
    to encode the time series when viewed from left to right.
    For this, the time series is first binned into the given number of equal-width
    bins spanning the range from its minimum to its maximum.
    Then it is converted into sub-words with different prefixes.
    The number of sub-words needed for this divided by the length of the time
    series is the complexity estimate.

    For example, if the time series (after binning in only 2 bins) would look like "100111",
    the different sub-words would be 1, 0, 01 and 11 and therefore the result is 4/6 = 0.66.

    A trailing piece of the series that only repeats an already known sub-word
    is not counted.

    Ref: https://github.com/Naereen/Lempel-Ziv_Complexity/blob/master/src/lempel_ziv_complexity.py

    :param x: the time series to calculate the feature of (must not be empty)
    :param bins: the number of equal-width bins used to discretize the time series
    :return: the number of sub-words divided by the length of the time series
    """
    x = np.asarray(x)

    # Upper edges of `bins` equal-width bins over [min(x), max(x)].
    # Using linspace(min, max, bins) as edges would leave the first bin with
    # nothing but the exact minimum, so the lower bound is dropped instead.
    upper_edges = np.linspace(np.min(x), np.max(x), bins + 1)[1:]
    # side="left": a value equal to an edge belongs to the bin that edge closes,
    # so the maximum ends up in the last bin (index bins - 1).
    sequence = np.searchsorted(upper_edges, x, side="left").tolist()

    sub_strings = set()
    n = len(sequence)

    ind = 0
    inc = 1
    while ind + inc <= n:
        # convert to tuple to make it hashable
        sub_str = tuple(sequence[ind:ind + inc])
        if sub_str in sub_strings:
            # already known: extend the candidate sub-word by one symbol
            inc += 1
        else:
            # new sub-word: store it and continue right after it
            sub_strings.add(sub_str)
            ind += inc
            inc = 1
    return len(sub_strings) / n


@set_property("fctype", "simple")
def mean_abs_change(x):
"""
Expand Down Expand Up @@ -1512,6 +1567,11 @@ def binned_entropy(x, max_bins):
"""
if not isinstance(x, (np.ndarray, pd.Series)):
x = np.asarray(x)

# nan makes no sense here
if np.isnan(x).any():
return np.nan

hist, bin_edges = np.histogram(x, bins=max_bins)
probs = hist / x.size
probs[probs == 0] = 1.0
Expand Down
4 changes: 3 additions & 1 deletion tsfresh/feature_extraction/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,9 @@ def __init__(self):
"linear_trend_timewise": [{"attr": "pvalue"}, {"attr": "rvalue"}, {"attr": "intercept"},
{"attr": "slope"}, {"attr": "stderr"}],
"count_above": [{"t": 0}],
"count_below": [{"t": 0}]
"count_below": [{"t": 0}],
"lempel_ziv_complexity": [{"bins": x} for x in [2, 3, 5, 10, 100]],
"fourier_entropy": [{"bins": x} for x in [2, 3, 5, 10, 100]],

})

Expand Down

0 comments on commit 8dd3f72

Please sign in to comment.