diff --git a/tests/units/feature_extraction/test_feature_calculations.py b/tests/units/feature_extraction/test_feature_calculations.py index 5ea74bbaf..a77ef5be6 100644 --- a/tests/units/feature_extraction/test_feature_calculations.py +++ b/tests/units/feature_extraction/test_feature_calculations.py @@ -23,10 +23,10 @@ def assertEqualOnAllArrayTypes(self, f, input_to_f, result, *args, **kwargs): msg="Not equal for lists: {} != {}".format(f(input_to_f, *args, **kwargs), result)) self.assertEqual(f(np.array(input_to_f), *args, **kwargs), result, msg="Not equal for numpy.arrays: {} != {}".format( - f(np.array(input_to_f), *args, **kwargs), result)) + f(np.array(input_to_f), *args, **kwargs), result)) self.assertEqual(f(pd.Series(input_to_f), *args, **kwargs), result, msg="Not equal for pandas.Series: {} != {}".format( - f(pd.Series(input_to_f), *args, **kwargs), result)) + f(pd.Series(input_to_f), *args, **kwargs), result)) def assertTrueOnAllArrayTypes(self, f, input_to_f, *args, **kwargs): self.assertTrue(f(input_to_f, *args, **kwargs), msg="Not true for lists") @@ -323,8 +323,8 @@ def test_cid_ce(self): self.assertEqualOnAllArrayTypes(cid_ce, [-4.33, -1.33, 2.67], 5, normalize=False) def test_lempel_ziv_complexity(self): - self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, [1, 1, 1], 2./3, bins=2) - self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, [1, 1, 1], 2./3, bins=5) + self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, [1, 1, 1], 2. / 3, bins=2) + self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, [1, 1, 1], 2. 
/ 3, bins=5) self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, [1, 1, 1, 1, 1, 1, 1], 0.4285714285, bins=2) @@ -403,7 +403,7 @@ def test_standard_deviation(self): self.assertIsNanOnAllArrayTypes(standard_deviation, []) def test_variation_coefficient(self): - self.assertIsNanOnAllArrayTypes(variation_coefficient, [1, 1, -1, -1],) + self.assertIsNanOnAllArrayTypes(variation_coefficient, [1, 1, -1, -1], ) self.assertAlmostEqualOnAllArrayTypes(variation_coefficient, [1, 2, -3, -1], -7.681145747868608) self.assertAlmostEqualOnAllArrayTypes(variation_coefficient, [1, 2, 4, -1], 1.2018504251546631) self.assertIsNanOnAllArrayTypes(variation_coefficient, []) @@ -606,6 +606,7 @@ def test_fft_aggregated(self): # Gaussian test: def normal(y, mean_, sigma_): return 1 / (2 * np.pi * sigma_ ** 2) * np.exp(-(y - mean_) ** 2 / (2 * sigma_ ** 2)) + mean_ = 500. sigma_ = 1. range_ = int(2 * mean_) @@ -615,7 +616,7 @@ def normal(y, mean_, sigma_): # Hand calculated values of centroid and variance based for the half-normal dist: # (Ref: https://en.wikipedia.org/wiki/Half-normal_distribution) expected_fft_centroid = (range_ / (2 * np.pi * sigma_)) * np.sqrt(2 / np.pi) - expected_fft_var = (range_ / (2 * np.pi * sigma_))**2 * (1 - 2 / np.pi) + expected_fft_var = (range_ / (2 * np.pi * sigma_)) ** 2 * (1 - 2 / np.pi) # Calculate values for unit test: res = pd.Series(dict(fft_aggregated(x, param))) @@ -1221,26 +1222,47 @@ def test_count_above(self): self.assertEqualPandasSeriesWrapper(count_above, [1] * 10, 1, t=1) self.assertEqualPandasSeriesWrapper(count_above, list(range(10)), 1, t=0) self.assertEqualPandasSeriesWrapper(count_above, list(range(10)), 0.5, t=5) - self.assertEqualPandasSeriesWrapper(count_above, [0.1, 0.2, 0.3] * 3, 2/3, t=0.2) - self.assertEqualPandasSeriesWrapper(count_above, [np.NaN, 0, 1] * 3, 2/3, t=0) - self.assertEqualPandasSeriesWrapper(count_above, [np.NINF, 0, 1] * 3, 2/3, t=0) + self.assertEqualPandasSeriesWrapper(count_above, [0.1, 0.2, 0.3] * 3, 2 / 
3, t=0.2) + self.assertEqualPandasSeriesWrapper(count_above, [np.NaN, 0, 1] * 3, 2 / 3, t=0) + self.assertEqualPandasSeriesWrapper(count_above, [np.NINF, 0, 1] * 3, 2 / 3, t=0) self.assertEqualPandasSeriesWrapper(count_above, [np.PINF, 0, 1] * 3, 1, t=0) self.assertEqualPandasSeriesWrapper(count_above, [np.NaN, 0, 1] * 3, 0, t=np.NaN) self.assertEqualPandasSeriesWrapper(count_above, [np.NINF, 0, np.PINF] * 3, 1, t=np.NINF) - self.assertEqualPandasSeriesWrapper(count_above, [np.PINF, 0, 1] * 3, 1/3, t=np.PINF) + self.assertEqualPandasSeriesWrapper(count_above, [np.PINF, 0, 1] * 3, 1 / 3, t=np.PINF) def test_count_below(self): self.assertEqualPandasSeriesWrapper(count_below, [1] * 10, 1, t=1) - self.assertEqualPandasSeriesWrapper(count_below, list(range(10)), 1/10, t=0) - self.assertEqualPandasSeriesWrapper(count_below, list(range(10)), 6/10, t=5) - self.assertEqualPandasSeriesWrapper(count_below, [0.1, 0.2, 0.3] * 3, 2/3, t=0.2) - self.assertEqualPandasSeriesWrapper(count_below, [np.NaN, 0, 1] * 3, 1/3, t=0) - self.assertEqualPandasSeriesWrapper(count_below, [np.NINF, 0, 1] * 3, 2/3, t=0) - self.assertEqualPandasSeriesWrapper(count_below, [np.PINF, 0, 1] * 3, 1/3, t=0) + self.assertEqualPandasSeriesWrapper(count_below, list(range(10)), 1 / 10, t=0) + self.assertEqualPandasSeriesWrapper(count_below, list(range(10)), 6 / 10, t=5) + self.assertEqualPandasSeriesWrapper(count_below, [0.1, 0.2, 0.3] * 3, 2 / 3, t=0.2) + self.assertEqualPandasSeriesWrapper(count_below, [np.NaN, 0, 1] * 3, 1 / 3, t=0) + self.assertEqualPandasSeriesWrapper(count_below, [np.NINF, 0, 1] * 3, 2 / 3, t=0) + self.assertEqualPandasSeriesWrapper(count_below, [np.PINF, 0, 1] * 3, 1 / 3, t=0) self.assertEqualPandasSeriesWrapper(count_below, [np.NaN, 0, 1] * 3, 0, t=np.NaN) - self.assertEqualPandasSeriesWrapper(count_below, [np.NINF, 0, np.PINF] * 3, 1/3, t=np.NINF) + self.assertEqualPandasSeriesWrapper(count_below, [np.NINF, 0, np.PINF] * 3, 1 / 3, t=np.NINF) 
self.assertEqualPandasSeriesWrapper(count_below, [np.PINF, 0, 1] * 3, 1, t=np.PINF) + def test_benford_correlation(self): + # A test with list of random values + np.random.seed(42) + random_list = np.random.uniform(size=100) + + # Fibonacci series is known to match the Newcomb-Benford's Distribution + fibonacci_list = [0, 1] + for i in range(2, 200): + fibonacci_list.append(fibonacci_list[i - 1] + fibonacci_list[i - 2]) + + # A list of equally distributed digits (returns NaN) + equal_list = [1, 2, 3, 4, 5, 6, 7, 8, 9] + + # A list containing NaN + list_with_nan = [1.354, 0.058, 0.055, 0.99, 3.15, np.nan, 0.3, 2.3, 0, 0.59, 0.74] + + self.assertAlmostEqual(benford_correlation(random_list), 0.39458056) + self.assertAlmostEqual(benford_correlation(fibonacci_list), 0.998003988) + self.assertAlmostEqual(benford_correlation(list_with_nan), 0.10357511) + self.assertIsNaN(benford_correlation(equal_list)) + class FriedrichTestCase(TestCase): diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py index 46d8367fa..30dbb7e51 100644 --- a/tsfresh/feature_extraction/feature_calculators.py +++ b/tsfresh/feature_extraction/feature_calculators.py @@ -2083,3 +2083,44 @@ def count_below(x, t): :return type: float """ return np.sum(x <= t)/len(x) + + +@set_property("fctype", "simple") +def benford_correlation(x): + """ + Useful for anomaly detection applications [1][2]. Returns the correlation from first digit distribution when + compared to the Newcomb-Benford's Law distribution [3][4]. + + .. math:: + + P(d)=\\log_{10}\\left(1+\\frac{1}{d}\\right) + + where :math:`P(d)` is the Newcomb-Benford distribution for :math:`d` that is the leading digit of the number + {1, 2, 3, 4, 5, 6, 7, 8, 9}. + + .. rubric:: References + + | [1] A Statistical Derivation of the Significant-Digit Law, Theodore P. Hill, Statistical Science, 1995 + | [2] The significant-digit phenomenon, Theodore P. 
@set_property("fctype", "simple")
def benford_correlation(x):
    """
    Useful for anomaly detection applications [1][2]. Returns the correlation from first digit distribution when
    compared to the Newcomb-Benford's Law distribution [3][4].

    .. math::

        P(d)=\\log_{10}\\left(1+\\frac{1}{d}\\right)

    where :math:`P(d)` is the Newcomb-Benford distribution for :math:`d` that is the leading digit of the number
    {1, 2, 3, 4, 5, 6, 7, 8, 9}.

    .. rubric:: References

    | [1] A Statistical Derivation of the Significant-Digit Law, Theodore P. Hill, Statistical Science, 1995
    | [2] The significant-digit phenomenon, Theodore P. Hill, The American Mathematical Monthly, 1995
    | [3] The law of anomalous numbers, Frank Benford, Proceedings of the American philosophical society, 1938
    | [4] Note on the frequency of use of the different digits in natural numbers, Simon Newcomb, American Journal of
    | mathematics, 1881

    :param x: the time series to calculate the feature of
    :type x: numpy.ndarray
    :return: the value of this feature
    :return type: float
    """
    values = np.abs(np.nan_to_num(np.asarray(x)))

    # Leading decimal digit of each value, read off the scientific-notation string.
    # NaNs were mapped to 0 above; digit 0 matches none of the 1-9 bins and so
    # only dilutes every observed frequency (intended, per the unit tests).
    leading_digits = np.array([int(str(np.format_float_scientific(v))[:1]) for v in values])

    digits = np.arange(1, 10)
    # Theoretical Newcomb-Benford frequencies P(d) = log10(1 + 1/d)
    benford_distribution = np.log10(1 + 1 / digits)
    # Observed relative frequency of each leading digit 1..9
    observed_distribution = np.array([(leading_digits == d).mean() for d in digits])

    # Off-diagonal entry of the 2x2 matrix from np.corrcoef is the Pearson
    # correlation between the theoretical and observed distributions
    # (NaN when the observed distribution is constant).
    return np.corrcoef(benford_distribution, observed_distribution)[0, 1]