diff --git a/docs/conf.py b/docs/conf.py index 1a65b2e7a..d463d6668 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -23,13 +23,15 @@ # DON'T FORGET: Check the box "Install your project inside a virtualenv using # setup.py install" in the RTD Advanced Settings. import os -on_rtd = os.environ.get('READTHEDOCS', None) == 'True' + +on_rtd = os.environ.get("READTHEDOCS", None) == "True" if on_rtd: import inspect from sphinx import apidoc - __location__ = os.path.join(os.getcwd(), os.path.dirname( - inspect.getfile(inspect.currentframe()))) + __location__ = os.path.join( + os.getcwd(), os.path.dirname(inspect.getfile(inspect.currentframe())) + ) output_dir = os.path.join(__location__, "../docs/api") module_dir = os.path.join(__location__, "../tsfresh") @@ -40,40 +42,48 @@ # -- General configuration ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. -needs_sphinx = '1.6.4' +needs_sphinx = "1.6.4" # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', - 'sphinx.ext.autosummary', 'sphinx.ext.viewcode', 'sphinx.ext.coverage', - 'sphinx.ext.doctest', 'sphinx.ext.ifconfig', 'sphinx.ext.imgmath', - 'sphinx.ext.napoleon'] +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.autosummary", + "sphinx.ext.viewcode", + "sphinx.ext.coverage", + "sphinx.ext.doctest", + "sphinx.ext.ifconfig", + "sphinx.ext.imgmath", + "sphinx.ext.napoleon", +] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. now = datetime.datetime.today() -project = 'tsfresh' -copyright = '2016-{}, Maximilian Christ et al./ Blue Yonder GmbH'.format(now.year) +project = "tsfresh" +copyright = "2016-{}, Maximilian Christ et al./ Blue Yonder GmbH".format(now.year) # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '' # Is set by calling `setup.py docs` +version = "" # Is set by calling `setup.py docs` # The full version, including alpha/beta/rc tags. -release = '' # Is set by calling `setup.py docs` +release = "" # Is set by calling `setup.py docs` # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -87,7 +97,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build', 'api/tests*'] +exclude_patterns = ["_build", "api/tests*"] # The reST default role (used for this markup: `text`) to use for all documents. # default_role = None @@ -104,7 +114,7 @@ # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = ["tsfresh", "tsfresh.convenience"] @@ -121,14 +131,12 @@ # The theme to use for HTML and HTML Help pages. 
See the documentation for # a list of builtin themes. -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -html_theme_options = { - "style_nav_header_background": "#51b63c" -} +html_theme_options = {"style_nav_header_background": "#51b63c"} # Add any paths that contain custom themes here, relative to this directory. # html_theme_path = [] @@ -157,13 +165,13 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # From https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html html_context = { - 'css_files': [ - '_static/theme_override.css', # override wide tables in RTD theme - ], + "css_files": [ + "_static/theme_override.css", # override wide tables in RTD theme + ], } # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, @@ -208,7 +216,7 @@ # html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'tsfresh-doc' +htmlhelp_basename = "tsfresh-doc" # -- Options for LaTeX output -------------------------------------------------- @@ -216,10 +224,8 @@ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # 'preamble': '', } @@ -227,8 +233,7 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). 
latex_documents = [ - ('index', 'user_guide.tex', 'tsfresh Documentation', - '', 'manual'), + ("index", "user_guide.tex", "tsfresh Documentation", "", "manual"), ] # The name of an image file (relative to this directory) to place at the top of @@ -252,13 +257,13 @@ # latex_domain_indices = True # -- External mapping ------------------------------------------------------------ -python_version = '.'.join(map(str, sys.version_info[0:2])) +python_version = ".".join(map(str, sys.version_info[0:2])) intersphinx_mapping = { - 'sphinx': ('http://sphinx.pocoo.org', None), - 'python': ('http://docs.python.org/' + python_version, None), - 'matplotlib': ('http://matplotlib.sourceforge.net', None), - 'numpy': ('http://docs.scipy.org/doc/numpy', None), - 'sklearn': ('http://scikit-learn.org/stable', None), - 'pandas': ('http://pandas.pydata.org/pandas-docs/stable', None), - 'scipy': ('http://docs.scipy.org/doc/scipy/reference/', None), + "sphinx": ("http://sphinx.pocoo.org", None), + "python": ("http://docs.python.org/" + python_version, None), + "matplotlib": ("http://matplotlib.sourceforge.net", None), + "numpy": ("http://docs.scipy.org/doc/numpy", None), + "sklearn": ("http://scikit-learn.org/stable", None), + "pandas": ("http://pandas.pydata.org/pandas-docs/stable", None), + "scipy": ("http://docs.scipy.org/doc/scipy/reference/", None), } diff --git a/tests/benchmark.py b/tests/benchmark.py index 4f2a11ddd..183a5f802 100644 --- a/tests/benchmark.py +++ b/tests/benchmark.py @@ -4,20 +4,27 @@ import numpy as np from tsfresh import extract_features, extract_relevant_features -from tsfresh.feature_extraction.settings import ComprehensiveFCParameters, MinimalFCParameters +from tsfresh.feature_extraction.settings import ( + ComprehensiveFCParameters, + MinimalFCParameters, +) def create_data(time_series_length, num_ids, random_seed=42): np.random.seed(random_seed) - df = pd.concat([ - pd.DataFrame({ - "id": [i] * time_series_length, - "time": range(time_series_length), - "value": np.random.randn(time_series_length).cumsum() - }) - for i in range(num_ids) - ]) + df = pd.concat( + [ + pd.DataFrame( + { + "id": [i] * time_series_length, + "time": range(time_series_length), + "value": np.random.randn(time_series_length).cumsum(), + } + ) + for i in range(num_ids) + ] + ) return df @@ -25,20 +32,39 @@ def create_data(time_series_length, num_ids, random_seed=42): def test_benchmark_small_data(benchmark): df = create_data(5, 200) - benchmark(extract_features, df, column_id="id", column_sort="time", n_jobs=0, - disable_progressbar=True) + benchmark( + extract_features, + df, + column_id="id", + column_sort="time", + n_jobs=0, + disable_progressbar=True, + ) def test_benchmark_large_data(benchmark): df = create_data(500, 20) - benchmark(extract_features, df, column_id="id", column_sort="time", n_jobs=0, - disable_progressbar=True) + benchmark( + extract_features, + df, + column_id="id", + column_sort="time", + n_jobs=0, + disable_progressbar=True, + ) def test_benchmark_with_selection(benchmark): df = create_data(500, 20) y = pd.Series(np.random.choice([0, 1], 20)) - benchmark(extract_relevant_features, df, y, column_id="id", column_sort="time", n_jobs=0, - disable_progressbar=True) + benchmark( + extract_relevant_features, + df, + y, + column_id="id", + column_sort="time", + n_jobs=0, + disable_progressbar=True, + ) diff --git a/tests/fixtures.py b/tests/fixtures.py index 3b3737c1c..e5c05e0ea 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -14,7 +14,7 @@ def warning_free(): """Small helper to surpress 
all warnings""" with warnings.catch_warnings(): - warnings.simplefilter('ignore') + warnings.simplefilter("ignore") yield @@ -28,64 +28,227 @@ class DataTestCase(TestCase): def create_test_data_sample(self): cid = np.repeat([10, 500], 40) ckind = np.repeat(["a", "b", "a", "b"], 20) - csort = [30, 53, 26, 35, 42, 25, 17, 67, 20, 68, 46, 12, 0, 74, 66, 31, 32, - 2, 55, 59, 56, 60, 34, 69, 47, 15, 49, 8, 50, 73, 23, 62, 24, 33, - 22, 70, 3, 38, 28, 75, 39, 36, 64, 13, 72, 52, 40, 16, 58, 29, 63, - 79, 61, 78, 1, 10, 4, 6, 65, 44, 54, 48, 11, 14, 19, 43, 76, 7, - 51, 9, 27, 21, 5, 71, 57, 77, 41, 18, 45, 37] - cval = [11, 9, 67, 45, 30, 58, 62, 19, 56, 29, 0, 27, 36, 43, 33, 2, 24, - 71, 41, 28, 50, 40, 39, 7, 53, 23, 16, 37, 66, 38, 6, 47, 3, 61, - 44, 42, 78, 31, 21, 55, 15, 35, 25, 32, 69, 65, 70, 64, 51, 46, 5, - 77, 26, 73, 76, 75, 72, 74, 10, 57, 4, 14, 68, 22, 18, 52, 54, 60, - 79, 12, 49, 63, 8, 59, 1, 13, 20, 17, 48, 34] + csort = [ + 30, + 53, + 26, + 35, + 42, + 25, + 17, + 67, + 20, + 68, + 46, + 12, + 0, + 74, + 66, + 31, + 32, + 2, + 55, + 59, + 56, + 60, + 34, + 69, + 47, + 15, + 49, + 8, + 50, + 73, + 23, + 62, + 24, + 33, + 22, + 70, + 3, + 38, + 28, + 75, + 39, + 36, + 64, + 13, + 72, + 52, + 40, + 16, + 58, + 29, + 63, + 79, + 61, + 78, + 1, + 10, + 4, + 6, + 65, + 44, + 54, + 48, + 11, + 14, + 19, + 43, + 76, + 7, + 51, + 9, + 27, + 21, + 5, + 71, + 57, + 77, + 41, + 18, + 45, + 37, + ] + cval = [ + 11, + 9, + 67, + 45, + 30, + 58, + 62, + 19, + 56, + 29, + 0, + 27, + 36, + 43, + 33, + 2, + 24, + 71, + 41, + 28, + 50, + 40, + 39, + 7, + 53, + 23, + 16, + 37, + 66, + 38, + 6, + 47, + 3, + 61, + 44, + 42, + 78, + 31, + 21, + 55, + 15, + 35, + 25, + 32, + 69, + 65, + 70, + 64, + 51, + 46, + 5, + 77, + 26, + 73, + 76, + 75, + 72, + 74, + 10, + 57, + 4, + 14, + 68, + 22, + 18, + 52, + 54, + 60, + 79, + 12, + 49, + 63, + 8, + 59, + 1, + 13, + 20, + 17, + 48, + 34, + ] df = pd.DataFrame({"id": cid, "kind": ckind, "sort": csort, "val": cval}) df = df.set_index("id", drop=False) df.index.name = None return df def create_test_data_sample_wide(self): - rec = np.rec.array([ - (0, 10, 0, 11, 50), - (1, 10, 1, 9, 40), - (2, 10, 2, 67, 39), - (3, 10, 3, 45, 7), - (4, 10, 4, 30, 53), - (5, 10, 5, 58, 23), - (6, 10, 6, 62, 16), - (7, 10, 7, 19, 37), - (8, 10, 8, 56, 66), - (9, 10, 9, 29, 38), - (10, 10, 10, 0, 6), - (11, 10, 11, 27, 47), - (12, 10, 12, 36, 3), - (13, 10, 13, 43, 61), - (14, 10, 14, 33, 44), - (15, 10, 15, 2, 42), - (16, 10, 16, 24, 78), - (17, 10, 17, 71, 31), - (18, 10, 18, 41, 21), - (19, 10, 19, 28, 55), - (20, 500, 0, 15, 4), - (21, 500, 1, 35, 14), - (22, 500, 2, 25, 68), - (23, 500, 3, 32, 22), - (24, 500, 4, 69, 18), - (25, 500, 5, 65, 52), - (26, 500, 6, 70, 54), - (27, 500, 7, 64, 60), - (28, 500, 8, 51, 79), - (29, 500, 9, 46, 12), - (30, 500, 10, 5, 49), - (31, 500, 11, 77, 63), - (32, 500, 12, 26, 8), - (33, 500, 13, 73, 59), - (34, 500, 14, 76, 1), - (35, 500, 15, 75, 13), - (36, 500, 16, 72, 20), - (37, 500, 17, 74, 17), - (38, 500, 18, 10, 48), - (39, 500, 19, 57, 34)], - dtype=[('index', ' 2 assert y.dtype == np.object diff --git a/tests/integrations/test_bindings.py b/tests/integrations/test_bindings.py index 92f861c52..66465441f 100644 --- a/tests/integrations/test_bindings.py +++ b/tests/integrations/test_bindings.py @@ -9,18 +9,26 @@ class DaskBindingsTestCase(TestCase): def test_feature_extraction(self): - df = pd.DataFrame({"my_id": [1, 1, 1, 2, 2, 2], "my_kind": ["a"]*6, - "my_value": [1, 2, 3, 4, 5, 6]}) + df = pd.DataFrame( + { + "my_id": [1, 1, 1, 
2, 2, 2], + "my_kind": ["a"] * 6, + "my_value": [1, 2, 3, 4, 5, 6], + } + ) df = dd.from_pandas(df, chunksize=3) df_grouped = df.groupby(["my_id", "my_kind"]) - features = dask_feature_extraction_on_chunk(df_grouped, column_id="my_id", - column_kind="my_kind", - column_value="my_value", - column_sort=None, - default_fc_parameters=MinimalFCParameters()) + features = dask_feature_extraction_on_chunk( + df_grouped, + column_id="my_id", + column_kind="my_kind", + column_value="my_value", + column_sort=None, + default_fc_parameters=MinimalFCParameters(), + ) features = features.compute() diff --git a/tests/integrations/test_feature_extraction.py b/tests/integrations/test_feature_extraction.py index 1c820c343..1ffa482ff 100644 --- a/tests/integrations/test_feature_extraction.py +++ b/tests/integrations/test_feature_extraction.py @@ -15,7 +15,7 @@ class FeatureExtractionTestCase(TestCase): def setUp(self): df, y = load_driftbif(100, 10, classification=True, seed=42) - df['my_id'] = df['id'].astype('str') + df["my_id"] = df["id"].astype("str") del df["id"] self.df = df @@ -24,29 +24,47 @@ def test_pandas(self): df = self.df # Test shape and a single entry (to see if it works at all) - X = extract_features(df, column_id="my_id", column_sort="time", column_kind="dimension", column_value="value", - default_fc_parameters=MinimalFCParameters()) + X = extract_features( + df, + column_id="my_id", + column_sort="time", + column_kind="dimension", + column_value="value", + default_fc_parameters=MinimalFCParameters(), + ) self.assertIn("1__mean", X.columns) self.assertAlmostEqual(X.loc["5", "1__mean"], 5.516e-05, 4) self.assertIn("11", X.index) self.assertEqual(X.shape, (100, 20)) - X = extract_features(df, column_id="my_id", column_sort="time", column_kind="dimension", - default_fc_parameters=MinimalFCParameters()) + X = extract_features( + df, + column_id="my_id", + column_sort="time", + column_kind="dimension", + default_fc_parameters=MinimalFCParameters(), + ) self.assertIn("1__mean", X.columns) self.assertAlmostEqual(X.loc["5", "1__mean"], 5.516e-05, 4) self.assertIn("11", X.index) self.assertEqual(X.shape, (100, 20)) - X = extract_features(df.drop(columns=["dimension"]), column_id="my_id", column_sort="time", - default_fc_parameters=MinimalFCParameters()) + X = extract_features( + df.drop(columns=["dimension"]), + column_id="my_id", + column_sort="time", + default_fc_parameters=MinimalFCParameters(), + ) self.assertIn("value__mean", X.columns) self.assertAlmostEqual(X.loc["5", "value__mean"], 5.516e-05, 4) self.assertIn("11", X.index) self.assertEqual(X.shape, (100, 10)) - X = extract_features(df.drop(columns=["dimension", "time"]), column_id="my_id", - default_fc_parameters=MinimalFCParameters()) + X = extract_features( + df.drop(columns=["dimension", "time"]), + column_id="my_id", + default_fc_parameters=MinimalFCParameters(), + ) self.assertIn("value__mean", X.columns) self.assertAlmostEqual(X.loc["5", "value__mean"], 5.516e-05, 4) self.assertIn("11", X.index) @@ -55,70 +73,116 @@ def test_pandas(self): def test_pandas_no_pivot(self): df = self.df - X = extract_features(df, column_id="my_id", column_sort="time", - column_kind="dimension", column_value="value", - pivot=False, - default_fc_parameters=MinimalFCParameters()) + X = extract_features( + df, + column_id="my_id", + column_sort="time", + column_kind="dimension", + column_value="value", + pivot=False, + default_fc_parameters=MinimalFCParameters(), + ) X = pd.DataFrame(X, columns=["my_id", "variable", "value"]) self.assertIn("1__mean", 
X["variable"].values) - self.assertAlmostEqual(X[(X["my_id"] == "5") & (X["variable"] == "1__mean")]["value"].iloc[0], 5.516e-05, 4) - self.assertEqual(X.shape, (100*20, 3)) - - X = extract_features(df, column_id="my_id", column_sort="time", - column_kind="dimension", - pivot=False, - default_fc_parameters=MinimalFCParameters()) + self.assertAlmostEqual( + X[(X["my_id"] == "5") & (X["variable"] == "1__mean")]["value"].iloc[0], + 5.516e-05, + 4, + ) + self.assertEqual(X.shape, (100 * 20, 3)) + + X = extract_features( + df, + column_id="my_id", + column_sort="time", + column_kind="dimension", + pivot=False, + default_fc_parameters=MinimalFCParameters(), + ) X = pd.DataFrame(X, columns=["my_id", "variable", "value"]) self.assertIn("1__mean", X["variable"].values) - self.assertAlmostEqual(X[(X["my_id"] == "5") & (X["variable"] == "1__mean")]["value"].iloc[0], 5.516e-05, 4) - self.assertEqual(X.shape, (100*20, 3)) - - X = extract_features(df.drop(columns=["dimension"]), column_id="my_id", - column_sort="time", - pivot=False, - default_fc_parameters=MinimalFCParameters()) + self.assertAlmostEqual( + X[(X["my_id"] == "5") & (X["variable"] == "1__mean")]["value"].iloc[0], + 5.516e-05, + 4, + ) + self.assertEqual(X.shape, (100 * 20, 3)) + + X = extract_features( + df.drop(columns=["dimension"]), + column_id="my_id", + column_sort="time", + pivot=False, + default_fc_parameters=MinimalFCParameters(), + ) X = pd.DataFrame(X, columns=["my_id", "variable", "value"]) self.assertIn("value__mean", X["variable"].values) - self.assertAlmostEqual(X[(X["my_id"] == "5") & (X["variable"] == "value__mean")]["value"].iloc[0], 5.516e-05, 4) - self.assertEqual(X.shape, (100*10, 3)) - - X = extract_features(df.drop(columns=["dimension", "time"]), column_id="my_id", - pivot=False, - default_fc_parameters=MinimalFCParameters()) + self.assertAlmostEqual( + X[(X["my_id"] == "5") & (X["variable"] == "value__mean")]["value"].iloc[0], + 5.516e-05, + 4, + ) + self.assertEqual(X.shape, (100 * 10, 3)) + + X = extract_features( + df.drop(columns=["dimension", "time"]), + column_id="my_id", + pivot=False, + default_fc_parameters=MinimalFCParameters(), + ) X = pd.DataFrame(X, columns=["my_id", "variable", "value"]) self.assertIn("value__mean", X["variable"].values) - self.assertAlmostEqual(X[(X["my_id"] == "5") & (X["variable"] == "value__mean")]["value"].iloc[0], 5.516e-05, 4) - self.assertEqual(X.shape, (100*10, 3)) + self.assertAlmostEqual( + X[(X["my_id"] == "5") & (X["variable"] == "value__mean")]["value"].iloc[0], + 5.516e-05, + 4, + ) + self.assertEqual(X.shape, (100 * 10, 3)) def test_dask(self): df = dd.from_pandas(self.df, npartitions=1) - X = extract_features(df, column_id="my_id", column_sort="time", - column_kind="dimension", column_value="value", - default_fc_parameters=MinimalFCParameters()).compute() + X = extract_features( + df, + column_id="my_id", + column_sort="time", + column_kind="dimension", + column_value="value", + default_fc_parameters=MinimalFCParameters(), + ).compute() self.assertIn("1__mean", X.columns) self.assertAlmostEqual(X.loc["5", "1__mean"], 5.516e-05, 4) self.assertIn("11", X.index) self.assertEqual(X.shape, (100, 20)) - X = extract_features(df, column_id="my_id", column_sort="time", - column_kind="dimension", - default_fc_parameters=MinimalFCParameters()).compute() + X = extract_features( + df, + column_id="my_id", + column_sort="time", + column_kind="dimension", + default_fc_parameters=MinimalFCParameters(), + ).compute() self.assertIn("1__mean", X.columns) self.assertAlmostEqual(X.loc["5", 
"1__mean"], 5.516e-05, 4) self.assertIn("11", X.index) self.assertEqual(X.shape, (100, 20)) - X = extract_features(df.drop(columns=["dimension"]), column_id="my_id", - column_sort="time", - default_fc_parameters=MinimalFCParameters()).compute() + X = extract_features( + df.drop(columns=["dimension"]), + column_id="my_id", + column_sort="time", + default_fc_parameters=MinimalFCParameters(), + ).compute() self.assertIn("value__mean", X.columns) self.assertAlmostEqual(X.loc["5", "value__mean"], 5.516e-05, 4) self.assertIn("11", X.index) self.assertEqual(X.shape, (100, 10)) - X = extract_features(df.drop(columns=["dimension", "time"]), column_id="my_id", - default_fc_parameters=MinimalFCParameters()).compute() + X = extract_features( + df.drop(columns=["dimension", "time"]), + column_id="my_id", + default_fc_parameters=MinimalFCParameters(), + ).compute() self.assertIn("value__mean", X.columns) self.assertAlmostEqual(X.loc["5", "value__mean"], 5.516e-05, 4) self.assertIn("11", X.index) @@ -127,33 +191,64 @@ def test_dask(self): def test_dask_no_pivot(self): df = dd.from_pandas(self.df, npartitions=1) - X = extract_features(df, column_id="my_id", column_sort="time", - column_kind="dimension", column_value="value", - pivot=False, - default_fc_parameters=MinimalFCParameters()).compute() + X = extract_features( + df, + column_id="my_id", + column_sort="time", + column_kind="dimension", + column_value="value", + pivot=False, + default_fc_parameters=MinimalFCParameters(), + ).compute() self.assertIn("1__mean", X["variable"].values) - self.assertAlmostEqual(X[(X["my_id"] == "5") & (X["variable"] == "1__mean")]["value"].iloc[0], 5.516e-05, 4) - self.assertEqual(X.shape, (100*20, 3)) - - X = extract_features(df, column_id="my_id", column_sort="time", - column_kind="dimension", - pivot=False, - default_fc_parameters=MinimalFCParameters()).compute() + self.assertAlmostEqual( + X[(X["my_id"] == "5") & (X["variable"] == "1__mean")]["value"].iloc[0], + 5.516e-05, + 4, + ) + self.assertEqual(X.shape, (100 * 20, 3)) + + X = extract_features( + df, + column_id="my_id", + column_sort="time", + column_kind="dimension", + pivot=False, + default_fc_parameters=MinimalFCParameters(), + ).compute() self.assertIn("1__mean", X["variable"].values) - self.assertAlmostEqual(X[(X["my_id"] == "5") & (X["variable"] == "1__mean")]["value"].iloc[0], 5.516e-05, 4) - self.assertEqual(X.shape, (100*20, 3)) - - X = extract_features(df.drop(columns=["dimension"]), column_id="my_id", - column_sort="time", - pivot=False, - default_fc_parameters=MinimalFCParameters()).compute() + self.assertAlmostEqual( + X[(X["my_id"] == "5") & (X["variable"] == "1__mean")]["value"].iloc[0], + 5.516e-05, + 4, + ) + self.assertEqual(X.shape, (100 * 20, 3)) + + X = extract_features( + df.drop(columns=["dimension"]), + column_id="my_id", + column_sort="time", + pivot=False, + default_fc_parameters=MinimalFCParameters(), + ).compute() self.assertIn("value__mean", X["variable"].values) - self.assertAlmostEqual(X[(X["my_id"] == "5") & (X["variable"] == "value__mean")]["value"].iloc[0], 5.516e-05, 4) - self.assertEqual(X.shape, (100*10, 3)) - - X = extract_features(df.drop(columns=["dimension", "time"]), column_id="my_id", - pivot=False, - default_fc_parameters=MinimalFCParameters()).compute() + self.assertAlmostEqual( + X[(X["my_id"] == "5") & (X["variable"] == "value__mean")]["value"].iloc[0], + 5.516e-05, + 4, + ) + self.assertEqual(X.shape, (100 * 10, 3)) + + X = extract_features( + df.drop(columns=["dimension", "time"]), + column_id="my_id", + 
pivot=False, + default_fc_parameters=MinimalFCParameters(), + ).compute() self.assertIn("value__mean", X["variable"].values) - self.assertAlmostEqual(X[(X["my_id"] == "5") & (X["variable"] == "value__mean")]["value"].iloc[0], 5.516e-05, 4) - self.assertEqual(X.shape, (100*10, 3)) + self.assertAlmostEqual( + X[(X["my_id"] == "5") & (X["variable"] == "value__mean")]["value"].iloc[0], + 5.516e-05, + 4, + ) + self.assertEqual(X.shape, (100 * 10, 3)) diff --git a/tests/integrations/test_full_pipeline.py b/tests/integrations/test_full_pipeline.py index 5a42cf4bc..60dd017c6 100644 --- a/tests/integrations/test_full_pipeline.py +++ b/tests/integrations/test_full_pipeline.py @@ -10,7 +10,10 @@ import pandas as pd from sklearn.pipeline import Pipeline -from tsfresh.examples.robot_execution_failures import load_robot_execution_failures, download_robot_execution_failures +from tsfresh.examples.robot_execution_failures import ( + load_robot_execution_failures, + download_robot_execution_failures, +) from tsfresh.transformers import RelevantFeatureAugmenter from tests.fixtures import warning_free @@ -21,7 +24,9 @@ def setUp(self): temporary_file = os.path.join(self.temporary_folder, "data") download_robot_execution_failures(file_name=temporary_file) - self.timeseries, self.y = load_robot_execution_failures(file_name=temporary_file) + self.timeseries, self.y = load_robot_execution_failures( + file_name=temporary_file + ) self.df = pd.DataFrame(index=self.timeseries.id.unique()) # shrink the time series for this test @@ -35,7 +40,9 @@ def test_relevant_extraction(self): self.assertGreater(len(self.df), 0) self.assertGreater(len(self.timeseries), 0) - relevant_augmenter = RelevantFeatureAugmenter(column_id="id", column_sort="time") + relevant_augmenter = RelevantFeatureAugmenter( + column_id="id", column_sort="time" + ) relevant_augmenter.set_timeseries_container(self.timeseries) pipe = Pipeline([("relevant_augmenter", relevant_augmenter)]) @@ -44,23 +51,25 @@ def test_relevant_extraction(self): pipe.fit(self.df, self.y) extracted_features = pipe.transform(self.df) - some_expected_features = {'F_x__abs_energy', - 'F_x__absolute_sum_of_changes', - 'F_x__ar_coefficient__coeff_0__k_10', - 'F_x__autocorrelation__lag_1', - 'F_x__binned_entropy__max_bins_10', - 'F_x__count_above_mean', - 'F_x__longest_strike_above_mean', - 'F_x__maximum', - 'F_x__mean_abs_change', - 'F_x__minimum', - 'F_x__quantile__q_0.1', - 'F_x__range_count__max_1__min_-1', - 'F_x__spkt_welch_density__coeff_2', - 'F_x__standard_deviation', - 'F_x__value_count__value_0', - 'F_x__variance', - 'F_x__variance_larger_than_standard_deviation'} + some_expected_features = { + "F_x__abs_energy", + "F_x__absolute_sum_of_changes", + "F_x__ar_coefficient__coeff_0__k_10", + "F_x__autocorrelation__lag_1", + "F_x__binned_entropy__max_bins_10", + "F_x__count_above_mean", + "F_x__longest_strike_above_mean", + "F_x__maximum", + "F_x__mean_abs_change", + "F_x__minimum", + "F_x__quantile__q_0.1", + "F_x__range_count__max_1__min_-1", + "F_x__spkt_welch_density__coeff_2", + "F_x__standard_deviation", + "F_x__value_count__value_0", + "F_x__variance", + "F_x__variance_larger_than_standard_deviation", + } self.assertGreaterEqual(set(extracted_features.columns), some_expected_features) self.assertGreater(len(extracted_features), 0) diff --git a/tests/integrations/test_notebooks.py b/tests/integrations/test_notebooks.py index ad59ddcb1..73fe77150 100644 --- a/tests/integrations/test_notebooks.py +++ b/tests/integrations/test_notebooks.py @@ -21,25 +21,31 @@ def 
_notebook_run(path, timeout=default_timeout): """ dirname, _ = os.path.split(path) - execproc_timeout = '--ExecutePreprocessor.timeout=%d' % timeout + execproc_timeout = "--ExecutePreprocessor.timeout=%d" % timeout # Do not run notebook tests on Travis. notebooks tests should only be # run in the local developer testing context and notebook tests often # cause time out failures on Travis builds see (github #409, #410) try: - if os.environ['TRAVIS']: + if os.environ["TRAVIS"]: return [], [] except BaseException: pass # Ensure temporary files are not auto-deleted as processes have limited # permissions to re-use file handles under WinNT-based operating systems. - fname = '' - with tempfile.NamedTemporaryFile(mode='w+t', suffix=".ipynb", delete=False) as fout: + fname = "" + with tempfile.NamedTemporaryFile(mode="w+t", suffix=".ipynb", delete=False) as fout: fname = fout.name - args = ["jupyter", "nbconvert", - "--to", "notebook", "--execute", execproc_timeout] + args = [ + "jupyter", + "nbconvert", + "--to", + "notebook", + "--execute", + execproc_timeout, + ] args += ["--ExecutePreprocessor.kernel_name=python3"] args += ["--output", fout.name, path] subprocess.check_call(args) @@ -48,58 +54,93 @@ def _notebook_run(path, timeout=default_timeout): nb = nbformat.read(fout, nbformat.current_nbformat) os.remove(fname) - errors = [output for cell in nb.cells if "outputs" in cell - for output in cell["outputs"] - if output.output_type == "error"] + errors = [ + output + for cell in nb.cells + if "outputs" in cell + for output in cell["outputs"] + if output.output_type == "error" + ] return nb, errors -@pytest.mark.skipif(os.environ.get('TEST_NOTEBOOKS') != 'y', reason="Disabled notebook testing") +@pytest.mark.skipif( + os.environ.get("TEST_NOTEBOOKS") != "y", reason="Disabled notebook testing" +) class NotebooksTestCase(TestCase): def test_basic_example(self): - nb, errors = _notebook_run('notebooks/examples/01 Feature Extraction and Selection.ipynb', default_timeout) + nb, errors = _notebook_run( + "notebooks/examples/01 Feature Extraction and Selection.ipynb", + default_timeout, + ) self.assertEqual(errors, []) def test_pipeline_example(self): - nb, errors = _notebook_run('notebooks/examples/02 sklearn Pipeline.ipynb', default_timeout) + nb, errors = _notebook_run( + "notebooks/examples/02 sklearn Pipeline.ipynb", default_timeout + ) self.assertEqual(errors, []) def test_extraction_settings(self): - nb, errors = _notebook_run('notebooks/examples/03 Feature Extraction Settings.ipynb', default_timeout) + nb, errors = _notebook_run( + "notebooks/examples/03 Feature Extraction Settings.ipynb", default_timeout + ) self.assertEqual(errors, []) def test_multiclass_selection_example(self): - nb, errors = _notebook_run('notebooks/examples/04 Multiclass Selection Example.ipynb', default_timeout) + nb, errors = _notebook_run( + "notebooks/examples/04 Multiclass Selection Example.ipynb", default_timeout + ) self.assertEqual(errors, []) def test_timeseries_forecasting(self): - nb, errors = _notebook_run('notebooks/examples/05 Timeseries Forecasting.ipynb', default_timeout) + nb, errors = _notebook_run( + "notebooks/examples/05 Timeseries Forecasting.ipynb", default_timeout + ) self.assertEqual(errors, []) def test_timeseries_forecasting_exprt(self): - nb, errors = _notebook_run('notebooks/advanced/05 Timeseries Forecasting (multiple ids).ipynb', default_timeout) + nb, errors = _notebook_run( + "notebooks/advanced/05 Timeseries Forecasting (multiple ids).ipynb", + default_timeout, + ) 
self.assertEqual(errors, []) def test_inspect_dft_features(self): - nb, errors = _notebook_run('notebooks/advanced/inspect_dft_features.ipynb', default_timeout) + nb, errors = _notebook_run( + "notebooks/advanced/inspect_dft_features.ipynb", default_timeout + ) self.assertEqual(errors, []) def test_feature_extraction_with_datetime_index(self): - nb, errors = _notebook_run('notebooks/advanced/feature_extraction_with_datetime_index.ipynb', default_timeout) + nb, errors = _notebook_run( + "notebooks/advanced/feature_extraction_with_datetime_index.ipynb", + default_timeout, + ) self.assertEqual(errors, []) def test_friedrich_coefficients(self): - nb, errors = _notebook_run('notebooks/advanced/friedrich_coefficients.ipynb', default_timeout) + nb, errors = _notebook_run( + "notebooks/advanced/friedrich_coefficients.ipynb", default_timeout + ) self.assertEqual(errors, []) def test_inspect_dft_features(self): - nb, errors = _notebook_run('notebooks/advanced/inspect_dft_features.ipynb', default_timeout) + nb, errors = _notebook_run( + "notebooks/advanced/inspect_dft_features.ipynb", default_timeout + ) self.assertEqual(errors, []) def test_perform_PCA_on_extracted_features(self): - nb, errors = _notebook_run('notebooks/advanced/perform-PCA-on-extracted-features.ipynb', default_timeout) + nb, errors = _notebook_run( + "notebooks/advanced/perform-PCA-on-extracted-features.ipynb", + default_timeout, + ) self.assertEqual(errors, []) def test_visualize_benjamini_yekutieli_procedure(self): - nb, errors = _notebook_run('notebooks/advanced/visualize-benjamini-yekutieli-procedure.ipynb', default_timeout) + nb, errors = _notebook_run( + "notebooks/advanced/visualize-benjamini-yekutieli-procedure.ipynb", + default_timeout, + ) self.assertEqual(errors, []) diff --git a/tests/integrations/test_relevant_feature_extraction.py b/tests/integrations/test_relevant_feature_extraction.py index f5455100b..2c1e16b89 100644 --- a/tests/integrations/test_relevant_feature_extraction.py +++ b/tests/integrations/test_relevant_feature_extraction.py @@ -26,28 +26,44 @@ def test_functional_equality(self): """ df, y = self.create_test_data_sample_with_target() - relevant_features = extract_relevant_features(df, y, column_id='id', column_value='val', column_kind='kind', - column_sort='sort') - - extracted_features = extract_features(df, column_id='id', - column_value='val', column_kind='kind', column_sort='sort', - impute_function=impute) + relevant_features = extract_relevant_features( + df, + y, + column_id="id", + column_value="val", + column_kind="kind", + column_sort="sort", + ) + + extracted_features = extract_features( + df, + column_id="id", + column_value="val", + column_kind="kind", + column_sort="sort", + impute_function=impute, + ) selected_features = select_features(extracted_features, y) self.assertEqual( - set(relevant_features.columns), set(selected_features.columns), - "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format(relevant_features.columns, - selected_features.columns)) + set(relevant_features.columns), + set(selected_features.columns), + "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format( + relevant_features.columns, selected_features.columns + ), + ) relevant_columns = relevant_features.columns relevant_index = relevant_features.index self.assertTrue( - relevant_features.equals(selected_features.loc[relevant_index][relevant_columns]), - "Should calculate the same feature values") + relevant_features.equals( + selected_features.loc[relevant_index][relevant_columns] + ), + "Should 
calculate the same feature values", + ) class RelevantFeatureExtractionTestCase(TestCase): - def setUp(self): np.random.seed(42) y = pd.Series(np.random.binomial(1, 0.5, 20), index=range(20)) @@ -66,7 +82,7 @@ def setUp(self): self.y = y def test_extracted_features_contain_X_features(self): - X = extract_relevant_features(self.df, self.y, self.X, column_id='id') + X = extract_relevant_features(self.df, self.y, self.X, column_id="id") self.assertIn("f1", X.columns) self.assertIn("f2", X.columns) pdt.assert_series_equal(self.X["f1"], X["f1"]) @@ -76,37 +92,105 @@ def test_extracted_features_contain_X_features(self): def test_extraction_null_as_column_name(self): - df1 = pd.DataFrame(data={0: range(10), 1: np.repeat([0, 1], 5), 2: np.repeat([0, 1, 2, 3, 4], 2)}) + df1 = pd.DataFrame( + data={ + 0: range(10), + 1: np.repeat([0, 1], 5), + 2: np.repeat([0, 1, 2, 3, 4], 2), + } + ) X1 = extract_features(df1, column_id=1, column_sort=2) self.assertEqual(len(X1), 2) - df2 = pd.DataFrame(data={1: range(10), 0: np.repeat([0, 1], 5), 2: np.repeat([0, 1, 2, 3, 4], 2)}) + df2 = pd.DataFrame( + data={ + 1: range(10), + 0: np.repeat([0, 1], 5), + 2: np.repeat([0, 1, 2, 3, 4], 2), + } + ) X2 = extract_features(df2, column_id=0, column_sort=2) self.assertEqual(len(X2), 2) - df3 = pd.DataFrame(data={0: range(10), 2: np.repeat([0, 1], 5), 1: np.repeat([0, 1, 2, 3, 4], 2)}) + df3 = pd.DataFrame( + data={ + 0: range(10), + 2: np.repeat([0, 1], 5), + 1: np.repeat([0, 1, 2, 3, 4], 2), + } + ) X3 = extract_features(df3, column_id=2, column_sort=1) self.assertEqual(len(X3), 2) def test_raises_mismatch_index_df_and_y_df_more(self): y = pd.Series(range(3), index=[1, 2, 3]) - df_dict = {"a": pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), - "b": pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})} - self.assertRaises(ValueError, extract_relevant_features, df_dict, y, None, None, None, "id", None, "val") + df_dict = { + "a": pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), + "b": pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]}), + } + self.assertRaises( + ValueError, + extract_relevant_features, + df_dict, + y, + None, + None, + None, + "id", + None, + "val", + ) def test_raises_mismatch_index_df_and_y_y_more(self): y = pd.Series(range(4), index=[1, 2, 3, 4]) df = pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}) - self.assertRaises(ValueError, extract_relevant_features, df, y, None, None, None, "id", None, "val") + self.assertRaises( + ValueError, + extract_relevant_features, + df, + y, + None, + None, + None, + "id", + None, + "val", + ) def test_raises_y_not_series(self): y = np.arange(10) - df_dict = {"a": pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), - "b": pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})} - self.assertRaises(AssertionError, extract_relevant_features, df_dict, y, None, None, None, "id", None, "val") + df_dict = { + "a": pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), + "b": pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]}), + } + self.assertRaises( + AssertionError, + extract_relevant_features, + df_dict, + y, + None, + None, + None, + "id", + None, + "val", + ) def test_raises_y_not_more_than_one_label(self): y = pd.Series(1, index=[1, 2, 3]) - df_dict = {"a": pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), - "b": pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})} 
- self.assertRaises(AssertionError, extract_relevant_features, df_dict, y, None, None, None, "id", None, "val") + df_dict = { + "a": pd.DataFrame({"val": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), + "b": pd.DataFrame({"val": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]}), + } + self.assertRaises( + AssertionError, + extract_relevant_features, + df_dict, + y, + None, + None, + None, + "id", + None, + "val", + ) diff --git a/tests/units/feature_extraction/test_data.py b/tests/units/feature_extraction/test_data.py index da89c8629..66c223be8 100644 --- a/tests/units/feature_extraction/test_data.py +++ b/tests/units/feature_extraction/test_data.py @@ -12,37 +12,247 @@ LongTsFrameAdapter, WideTsFrameAdapter, TsDictAdapter, - PartitionedTsData + PartitionedTsData, ) from tsfresh.utilities.distribution import MultiprocessingDistributor -TEST_DATA_EXPECTED_TUPLES = \ - [(10, 'a', pd.Series([36, 71, 27, 62, 56, 58, 67, 11, 2, 24, 45, 30, 0, - 9, 41, 28, 33, 19, 29, 43], - index=[10] * 20)), - (10, 'b', pd.Series([78, 37, 23, 44, 6, 3, 21, 61, 39, 31, 53, 16, 66, - 50, 40, 47, 7, 42, 38, 55], - index=[10] * 20)), - (500, 'a', pd.Series([76, 72, 74, 75, 32, 64, 46, 35, 15, 70, 57, 65, - 51, 26, 5, 25, 10, 69, 73, 77], - index=[500] * 20)), - (500, 'b', pd.Series([8, 60, 12, 68, 22, 17, 18, 63, 49, 34, 20, 52, - 48, 14, 79, 4, 1, 59, 54, 13], - index=[500] * 20))] - -WIDE_TEST_DATA_EXPECTED_TUPLES = \ - [(10, 'a', pd.Series([11, 9, 67, 45, 30, 58, 62, 19, 56, 29, 0, 27, 36, - 43, 33, 2, 24, 71, 41, 28], - index=list(range(20)))), - (10, 'b', pd.Series([50, 40, 39, 7, 53, 23, 16, 37, 66, 38, 6, 47, 3, - 61, 44, 42, 78, 31, 21, 55], - index=list(range(20)))), - (500, 'a', pd.Series([15, 35, 25, 32, 69, 65, 70, 64, 51, 46, 5, 77, - 26, 73, 76, 75, 72, 74, 10, 57], - index=list(range(20, 40)))), - (500, 'b', pd.Series([4, 14, 68, 22, 18, 52, 54, 60, 79, 12, 49, 63, - 8, 59, 1, 13, 20, 17, 48, 34], - index=list(range(20, 40))))] +TEST_DATA_EXPECTED_TUPLES = [ + ( + 10, + "a", + pd.Series( + [ + 36, + 71, + 27, + 62, + 56, + 58, + 67, + 11, + 2, + 24, + 45, + 30, + 0, + 9, + 41, + 28, + 33, + 19, + 29, + 43, + ], + index=[10] * 20, + ), + ), + ( + 10, + "b", + pd.Series( + [ + 78, + 37, + 23, + 44, + 6, + 3, + 21, + 61, + 39, + 31, + 53, + 16, + 66, + 50, + 40, + 47, + 7, + 42, + 38, + 55, + ], + index=[10] * 20, + ), + ), + ( + 500, + "a", + pd.Series( + [ + 76, + 72, + 74, + 75, + 32, + 64, + 46, + 35, + 15, + 70, + 57, + 65, + 51, + 26, + 5, + 25, + 10, + 69, + 73, + 77, + ], + index=[500] * 20, + ), + ), + ( + 500, + "b", + pd.Series( + [ + 8, + 60, + 12, + 68, + 22, + 17, + 18, + 63, + 49, + 34, + 20, + 52, + 48, + 14, + 79, + 4, + 1, + 59, + 54, + 13, + ], + index=[500] * 20, + ), + ), +] + +WIDE_TEST_DATA_EXPECTED_TUPLES = [ + ( + 10, + "a", + pd.Series( + [ + 11, + 9, + 67, + 45, + 30, + 58, + 62, + 19, + 56, + 29, + 0, + 27, + 36, + 43, + 33, + 2, + 24, + 71, + 41, + 28, + ], + index=list(range(20)), + ), + ), + ( + 10, + "b", + pd.Series( + [ + 50, + 40, + 39, + 7, + 53, + 23, + 16, + 37, + 66, + 38, + 6, + 47, + 3, + 61, + 44, + 42, + 78, + 31, + 21, + 55, + ], + index=list(range(20)), + ), + ), + ( + 500, + "a", + pd.Series( + [ + 15, + 35, + 25, + 32, + 69, + 65, + 70, + 64, + 51, + 46, + 5, + 77, + 26, + 73, + 76, + 75, + 72, + 74, + 10, + 57, + ], + index=list(range(20, 40)), + ), + ), + ( + 500, + "b", + pd.Series( + [ + 4, + 14, + 68, + 22, + 18, + 52, + 54, + 60, + 79, + 12, + 49, + 63, + 8, + 59, + 1, + 13, + 20, + 17, + 48, + 34, + ], + index=list(range(20, 40)), + ), + ), +] 
class DataAdapterTestCase(DataTestCase): @@ -86,16 +296,22 @@ def assert_data_chunk_object_equal(self, result, expected): dic_result = {str(x[0]) + "_" + str(x[1]): x[2] for x in result} dic_expected = {str(x[0]) + "_" + str(x[1]): x[2] for x in expected} for k in dic_result.keys(): - pd.testing.assert_series_equal(dic_result[k], dic_expected[k], check_names=False) + pd.testing.assert_series_equal( + dic_result[k], dic_expected[k], check_names=False + ) def test_simple_data_sample_two_timeseries(self): - df = pd.DataFrame({"id": [10] * 4, "kind": ["a"] * 2 + ["b"] * 2, "val": [36, 71, 78, 37]}) + df = pd.DataFrame( + {"id": [10] * 4, "kind": ["a"] * 2 + ["b"] * 2, "val": [36, 71, 78, 37]} + ) df.set_index("id", drop=False, inplace=True) df.index.name = None result = to_tsdata(df, "id", "kind", "val") - expected = [(10, 'a', pd.Series([36, 71], index=[10] * 2, name="val")), - (10, 'b', pd.Series([78, 37], index=[10] * 2, name="val"))] + expected = [ + (10, "a", pd.Series([36, 71], index=[10] * 2, name="val")), + (10, "b", pd.Series([78, 37], index=[10] * 2, name="val")), + ] self.assert_data_chunk_object_equal(result, expected) def test_simple_data_sample_four_timeseries(self): @@ -109,47 +325,60 @@ def test_simple_data_sample_four_timeseries(self): self.assert_data_chunk_object_equal(result, expected) def test_with_dictionaries_two_rows(self): - test_df = pd.DataFrame([{"value": 2, "sort": 2, "id": "id_1"}, - {"value": 1, "sort": 1, "id": "id_1"}]) + test_df = pd.DataFrame( + [ + {"value": 2, "sort": 2, "id": "id_1"}, + {"value": 1, "sort": 1, "id": "id_1"}, + ] + ) test_dict = {"a": test_df, "b": test_df} - result = to_tsdata(test_dict, column_id="id", column_value="value", column_sort="sort") - expected = [("id_1", 'a', pd.Series([1, 2], index=[1, 0], name="value")), - ("id_1", 'b', pd.Series([1, 2], index=[1, 0], name="value"))] + result = to_tsdata( + test_dict, column_id="id", column_value="value", column_sort="sort" + ) + expected = [ + ("id_1", "a", pd.Series([1, 2], index=[1, 0], name="value")), + ("id_1", "b", pd.Series([1, 2], index=[1, 0], name="value")), + ] self.assert_data_chunk_object_equal(result, expected) def test_with_dictionaries_two_rows(self): - test_df = pd.DataFrame([{"value": 1, "id": "id_1"}, - {"value": 2, "id": "id_1"}]) + test_df = pd.DataFrame([{"value": 1, "id": "id_1"}, {"value": 2, "id": "id_1"}]) test_dict = {"a": test_df, "b": test_df} result = to_tsdata(test_dict, column_id="id", column_value="value") - expected = [("id_1", 'a', pd.Series([1, 2], index=[0, 1], name="value")), - ("id_1", 'b', pd.Series([1, 2], index=[0, 1], name="value"))] + expected = [ + ("id_1", "a", pd.Series([1, 2], index=[0, 1], name="value")), + ("id_1", "b", pd.Series([1, 2], index=[0, 1], name="value")), + ] self.assert_data_chunk_object_equal(result, expected) def test_wide_dataframe_order_preserved_with_sort_column(self): - """ verifies that the order of the sort column from a wide time series container is preserved - """ + """verifies that the order of the sort column from a wide time series container is preserved""" - test_df = pd.DataFrame({'id': ["a", "a", "b"], - 'v1': [3, 2, 1], - 'v2': [13, 12, 11], - 'sort': [103, 102, 101]}) + test_df = pd.DataFrame( + { + "id": ["a", "a", "b"], + "v1": [3, 2, 1], + "v2": [13, 12, 11], + "sort": [103, 102, 101], + } + ) result = to_tsdata(test_df, column_id="id", column_sort="sort") - expected = [("a", 'v1', pd.Series([2, 3], index=[1, 0], name="v1")), - ("a", 'v2', pd.Series([12, 13], index=[1, 0], name="v2")), - ("b", 'v1', 
pd.Series([1], index=[2], name="v1")), - ("b", 'v2', pd.Series([11], index=[2], name="v2"))] + expected = [ + ("a", "v1", pd.Series([2, 3], index=[1, 0], name="v1")), + ("a", "v2", pd.Series([12, 13], index=[1, 0], name="v2")), + ("b", "v1", pd.Series([1], index=[2], name="v1")), + ("b", "v2", pd.Series([11], index=[2], name="v2")), + ] self.assert_data_chunk_object_equal(result, expected) def test_dask_dataframe_with_kind(self): - test_df = dd.from_pandas(pd.DataFrame({ - "id": [1, 2], - "kind": ["a", "a"], - "value": [1, 2] - }), npartitions=1) + test_df = dd.from_pandas( + pd.DataFrame({"id": [1, 2], "kind": ["a", "a"], "value": [1, 2]}), + npartitions=1, + ) result = to_tsdata(test_df, column_id="id", column_kind="kind") self.assertEqual(result.column_id, "id") @@ -157,158 +386,166 @@ def test_dask_dataframe_with_kind(self): self.assertEqual(result.column_value, "value") def test_f(chunk): - return pd.DataFrame({"id": chunk[0], "variable": chunk[1], "value": chunk[2]}) - - return_f = result.apply(test_f, meta=(("id", "int"), ("variable", "int"), ("value", "int"))).compute() - pd.testing.assert_frame_equal(return_f, pd.DataFrame({ - "id": [1, 2], - "variable": ["a", "a"], - "value": [1.0, 2.0] - })) + return pd.DataFrame( + {"id": chunk[0], "variable": chunk[1], "value": chunk[2]} + ) + + return_f = result.apply( + test_f, meta=(("id", "int"), ("variable", "int"), ("value", "int")) + ).compute() + pd.testing.assert_frame_equal( + return_f, + pd.DataFrame({"id": [1, 2], "variable": ["a", "a"], "value": [1.0, 2.0]}), + ) def test_dask_dataframe_without_kind(self): - test_df = dd.from_pandas(pd.DataFrame({ - "id": [1, 2], - "value_a": [1, 2], - "value_b": [3, 4] - }), npartitions=1) + test_df = dd.from_pandas( + pd.DataFrame({"id": [1, 2], "value_a": [1, 2], "value_b": [3, 4]}), + npartitions=1, + ) result = to_tsdata(test_df, column_id="id") self.assertEqual(result.column_id, "id") def test_f(chunk): - return pd.DataFrame({"id": chunk[0], "variable": chunk[1], "value": chunk[2]}) - - return_f = result.apply(test_f, meta=(("id", "int"), ("variable", "int"), ("value", "int"))).compute() - pd.testing.assert_frame_equal(return_f.reset_index(drop=True), pd.DataFrame({ - "id": [1, 2, 1, 2], - "variable": ["value_a", "value_a", "value_b", "value_b"], - "value": [1.0, 2.0, 3.0, 4.0] - })) - - test_df = dd.from_pandas(pd.DataFrame({ - "id": [1, 1], - "sort": [2, 1], - "value_a": [1, 2], - "value_b": [3, 4] - }), npartitions=1) + return pd.DataFrame( + {"id": chunk[0], "variable": chunk[1], "value": chunk[2]} + ) + + return_f = result.apply( + test_f, meta=(("id", "int"), ("variable", "int"), ("value", "int")) + ).compute() + pd.testing.assert_frame_equal( + return_f.reset_index(drop=True), + pd.DataFrame( + { + "id": [1, 2, 1, 2], + "variable": ["value_a", "value_a", "value_b", "value_b"], + "value": [1.0, 2.0, 3.0, 4.0], + } + ), + ) + + test_df = dd.from_pandas( + pd.DataFrame( + {"id": [1, 1], "sort": [2, 1], "value_a": [1, 2], "value_b": [3, 4]} + ), + npartitions=1, + ) result = to_tsdata(test_df, column_id="id", column_sort="sort") self.assertEqual(result.column_id, "id") def test_f(chunk): - return pd.DataFrame({"id": chunk[0], "variable": chunk[1], "value": chunk[2]}) - - return_f = result.apply(test_f, meta=(("id", "int"), ("variable", "int"), ("value", "int"))).compute() - - pd.testing.assert_frame_equal(return_f.reset_index(drop=True), pd.DataFrame({ - "id": [1, 1, 1, 1], - "variable": ["value_a", "value_a", "value_b", "value_b"], - "value": [2.0, 1.0, 4.0, 3.0] - })) + return 
pd.DataFrame( + {"id": chunk[0], "variable": chunk[1], "value": chunk[2]} + ) + + return_f = result.apply( + test_f, meta=(("id", "int"), ("variable", "int"), ("value", "int")) + ).compute() + + pd.testing.assert_frame_equal( + return_f.reset_index(drop=True), + pd.DataFrame( + { + "id": [1, 1, 1, 1], + "variable": ["value_a", "value_a", "value_b", "value_b"], + "value": [2.0, 1.0, 4.0, 3.0], + } + ), + ) def test_with_wrong_input(self): test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "sort": np.NaN}]) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", "kind", "value", "sort") + self.assertRaises(ValueError, to_tsdata, test_df, "id", "kind", "value", "sort") test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "sort": 1}]) - self.assertRaises(ValueError, to_tsdata, test_df, - "strange_id", "kind", "value", "sort") + self.assertRaises( + ValueError, to_tsdata, test_df, "strange_id", "kind", "value", "sort" + ) test_df = dd.from_pandas(test_df, npartitions=1) - self.assertRaises(ValueError, to_tsdata, test_df, - "strange_id", "kind", "value", "sort") - - test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "value_2": 1, "sort": 1}]) - self.assertRaises(ValueError, to_tsdata, test_df, - "strange_id", "kind", None, "sort") + self.assertRaises( + ValueError, to_tsdata, test_df, "strange_id", "kind", "value", "sort" + ) + + test_df = pd.DataFrame( + [{"id": 0, "kind": "a", "value": 3, "value_2": 1, "sort": 1}] + ) + self.assertRaises( + ValueError, to_tsdata, test_df, "strange_id", "kind", None, "sort" + ) test_df = dd.from_pandas(test_df, npartitions=1) - self.assertRaises(ValueError, to_tsdata, test_df, - "strange_id", "kind", None, "sort") + self.assertRaises( + ValueError, to_tsdata, test_df, "strange_id", "kind", None, "sort" + ) test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "sort": 1}]) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", "strange_kind", "value", "sort") + self.assertRaises( + ValueError, to_tsdata, test_df, "id", "strange_kind", "value", "sort" + ) test_df = dd.from_pandas(test_df, npartitions=1) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", "strange_kind", "value", "sort") + self.assertRaises( + ValueError, to_tsdata, test_df, "id", "strange_kind", "value", "sort" + ) test_df = pd.DataFrame([{"id": np.NaN, "kind": "a", "value": 3, "sort": 1}]) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", "kind", "value", "sort") + self.assertRaises(ValueError, to_tsdata, test_df, "id", "kind", "value", "sort") test_df = pd.DataFrame([{"id": 0, "kind": np.NaN, "value": 3, "sort": 1}]) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", "kind", "value", "sort") + self.assertRaises(ValueError, to_tsdata, test_df, "id", "kind", "value", "sort") test_df = pd.DataFrame([{"id": 2}, {"id": 1}]) test_dd = dd.from_pandas(test_df, npartitions=1) test_dict = {"a": test_df, "b": test_df} # column_id needs to be given - self.assertRaises(ValueError, to_tsdata, test_df, - None, "a", "b", None) - self.assertRaises(ValueError, to_tsdata, test_dd, - None, "a", "b", None) - self.assertRaises(ValueError, to_tsdata, test_df, - None, "a", "b", "a") - self.assertRaises(ValueError, to_tsdata, test_dd, - None, "a", "b", "a") - self.assertRaises(ValueError, to_tsdata, test_dict, - None, "a", "b", None) - self.assertRaises(ValueError, to_tsdata, test_dict, - None, "a", "b", "a") + self.assertRaises(ValueError, to_tsdata, test_df, None, "a", "b", None) + self.assertRaises(ValueError, to_tsdata, test_dd, None, "a", "b", None) + 
self.assertRaises(ValueError, to_tsdata, test_df, None, "a", "b", "a") + self.assertRaises(ValueError, to_tsdata, test_dd, None, "a", "b", "a") + self.assertRaises(ValueError, to_tsdata, test_dict, None, "a", "b", None) + self.assertRaises(ValueError, to_tsdata, test_dict, None, "a", "b", "a") # If there are more than one column, the algorithm can not choose the correct column - self.assertRaises(ValueError, to_tsdata, test_dict, - "id", None, None, None) + self.assertRaises(ValueError, to_tsdata, test_dict, "id", None, None, None) - test_dict = {"a": pd.DataFrame([{"id": 2, "value_a": 3}, {"id": 1, "value_a": 4}]), - "b": pd.DataFrame([{"id": 2}, {"id": 1}])} + test_dict = { + "a": pd.DataFrame([{"id": 2, "value_a": 3}, {"id": 1, "value_a": 4}]), + "b": pd.DataFrame([{"id": 2}, {"id": 1}]), + } # If there are more than one column, the algorithm can not choose the correct column - self.assertRaises(ValueError, to_tsdata, test_dict, - "id", None, None, None) + self.assertRaises(ValueError, to_tsdata, test_dict, "id", None, None, None) test_df = pd.DataFrame([{"id": 0, "value": np.NaN}]) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", None, "value", None) + self.assertRaises(ValueError, to_tsdata, test_df, "id", None, "value", None) test_df = pd.DataFrame([{"id": 0, "value": np.NaN}]) - self.assertRaises(ValueError, to_tsdata, test_df, - None, None, "value", None) + self.assertRaises(ValueError, to_tsdata, test_df, None, None, "value", None) test_df = pd.DataFrame([{"id": 0, "a_": 3, "b": 5, "sort": 1}]) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", None, None, "sort") + self.assertRaises(ValueError, to_tsdata, test_df, "id", None, None, "sort") test_df = dd.from_pandas(test_df, npartitions=1) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", None, None, "sort") + self.assertRaises(ValueError, to_tsdata, test_df, "id", None, None, "sort") test_df = pd.DataFrame([{"id": 0, "a__c": 3, "b": 5, "sort": 1}]) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", None, None, "sort") + self.assertRaises(ValueError, to_tsdata, test_df, "id", None, None, "sort") test_df = dd.from_pandas(test_df, npartitions=1) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", None, None, "sort") + self.assertRaises(ValueError, to_tsdata, test_df, "id", None, None, "sort") test_df = pd.DataFrame([{"id": 0}]) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", None, None, None) + self.assertRaises(ValueError, to_tsdata, test_df, "id", None, None, None) test_df = dd.from_pandas(test_df, npartitions=1) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", None, None, None) + self.assertRaises(ValueError, to_tsdata, test_df, "id", None, None, None) test_df = pd.DataFrame([{"id": 0, "sort": 0}]) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", None, None, "sort") + self.assertRaises(ValueError, to_tsdata, test_df, "id", None, None, "sort") test_df = dd.from_pandas(test_df, npartitions=1) - self.assertRaises(ValueError, to_tsdata, test_df, - "id", None, None, "sort") + self.assertRaises(ValueError, to_tsdata, test_df, "id", None, None, "sort") test_df = [1, 2, 3] - self.assertRaises(ValueError, to_tsdata, test_df, - "a", "b", "c", "d") + self.assertRaises(ValueError, to_tsdata, test_df, "a", "b", "c", "d") class PivotListTestCase(TestCase): @@ -352,7 +589,7 @@ def test_long_input(self): input_list = [] for i in range(100): for j in range(100): - input_list.append((i, j, i*j)) + input_list.append((i, j, i * j)) return_df = 
PartitionedTsData.pivot(mock_ts_data, input_list) diff --git a/tests/units/feature_extraction/test_extraction.py b/tests/units/feature_extraction/test_extraction.py index 5da421cd1..f6d761704 100644 --- a/tests/units/feature_extraction/test_extraction.py +++ b/tests/units/feature_extraction/test_extraction.py @@ -11,7 +11,10 @@ from tests.fixtures import DataTestCase from tsfresh.feature_extraction.extraction import extract_features -from tsfresh.feature_extraction.settings import ComprehensiveFCParameters, PickeableSettings +from tsfresh.feature_extraction.settings import ( + ComprehensiveFCParameters, + PickeableSettings, +) from tsfresh.utilities.distribution import IterableDistributorBaseClass, MapDistributor @@ -25,34 +28,64 @@ def setUp(self): def test_extract_features(self): # todo: implement more methods and test more aspects df = self.create_test_data_sample() - extracted_features = extract_features(df, column_id="id", column_sort="sort", - column_kind="kind", column_value="val", - n_jobs=self.n_jobs) + extracted_features = extract_features( + df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + n_jobs=self.n_jobs, + ) self.assertIsInstance(extracted_features, pd.DataFrame) self.assertTrue(np.all(extracted_features.a__maximum == np.array([71, 77]))) - self.assertTrue(np.all(extracted_features.a__sum_values == np.array([691, 1017]))) - self.assertTrue(np.all(extracted_features.a__abs_energy == np.array([32211, 63167]))) - self.assertTrue(np.all(extracted_features.b__sum_values == np.array([757, 695]))) + self.assertTrue( + np.all(extracted_features.a__sum_values == np.array([691, 1017])) + ) + self.assertTrue( + np.all(extracted_features.a__abs_energy == np.array([32211, 63167])) + ) + self.assertTrue( + np.all(extracted_features.b__sum_values == np.array([757, 695])) + ) self.assertTrue(np.all(extracted_features.b__minimum == np.array([3, 1]))) - self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483]))) + self.assertTrue( + np.all(extracted_features.b__abs_energy == np.array([36619, 35483])) + ) self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75]))) self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0]))) df_sts = self.create_one_valued_time_series() - extracted_features_sts = extract_features(df_sts, column_id="id", column_sort="sort", - column_kind="kind", column_value="val", - n_jobs=self.n_jobs) + extracted_features_sts = extract_features( + df_sts, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + n_jobs=self.n_jobs, + ) self.assertIsInstance(extracted_features_sts, pd.DataFrame) - self.assertTrue(np.all(extracted_features_sts.a__maximum == np.array([1.0, 6.0]))) - self.assertTrue(np.all(extracted_features_sts.a__sum_values == np.array([1.0, 11.0]))) - self.assertTrue(np.all(extracted_features_sts.a__count_above_mean == np.array([0, 1]))) + self.assertTrue( + np.all(extracted_features_sts.a__maximum == np.array([1.0, 6.0])) + ) + self.assertTrue( + np.all(extracted_features_sts.a__sum_values == np.array([1.0, 11.0])) + ) + self.assertTrue( + np.all(extracted_features_sts.a__count_above_mean == np.array([0, 1])) + ) def test_extract_features_uses_only_kind_to_fc_settings(self): df = self.create_test_data_sample() - extracted_features = extract_features(df, column_id="id", column_sort="sort", column_kind="kind", - column_value="val", n_jobs=self.n_jobs, - kind_to_fc_parameters={"a": {"maximum": None, "minimum": None}}) + 
extracted_features = extract_features( + df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + n_jobs=self.n_jobs, + kind_to_fc_parameters={"a": {"maximum": None, "minimum": None}}, + ) assert len(extracted_features) == 2 def test_extract_features_for_one_time_series(self): @@ -60,39 +93,63 @@ def test_extract_features_for_one_time_series(self): df = self.create_test_data_sample() settings = ComprehensiveFCParameters() - extracted_features = extract_features(df, default_fc_parameters=settings, - column_value="val", column_id="id", - column_kind="kind", column_sort="sort") + extracted_features = extract_features( + df, + default_fc_parameters=settings, + column_value="val", + column_id="id", + column_kind="kind", + column_sort="sort", + ) self.assertIsInstance(extracted_features, pd.DataFrame) - self.assertTrue(np.all(extracted_features.b__sum_values == np.array([757, 695]))) + self.assertTrue( + np.all(extracted_features.b__sum_values == np.array([757, 695])) + ) self.assertTrue(np.all(extracted_features.b__minimum == np.array([3, 1]))) - self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483]))) + self.assertTrue( + np.all(extracted_features.b__abs_energy == np.array([36619, 35483])) + ) self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75]))) self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0]))) df_sts = self.create_one_valued_time_series() - extracted_features_sts = extract_features(df_sts, default_fc_parameters=settings, - column_value="val", column_id="id", - column_kind="kind", column_sort="sort") + extracted_features_sts = extract_features( + df_sts, + default_fc_parameters=settings, + column_value="val", + column_id="id", + column_kind="kind", + column_sort="sort", + ) self.assertIsInstance(extracted_features_sts, pd.DataFrame) - self.assertTrue(np.all(extracted_features_sts.a__maximum == np.array([1.0, 6.0]))) - self.assertTrue(np.all(extracted_features_sts.a__sum_values == np.array([1.0, 11.0]))) - self.assertTrue(np.all(extracted_features_sts.a__count_above_mean == np.array([0, 1]))) + self.assertTrue( + np.all(extracted_features_sts.a__maximum == np.array([1.0, 6.0])) + ) + self.assertTrue( + np.all(extracted_features_sts.a__sum_values == np.array([1.0, 11.0])) + ) + self.assertTrue( + np.all(extracted_features_sts.a__count_above_mean == np.array([0, 1])) + ) def test_extract_features_for_index_based_functions(self): df = self.create_test_data_sample_with_time_index() settings = { - 'linear_trend_timewise': [{"attr": "slope"}], - 'linear_trend': [{"attr": "slope"}] + "linear_trend_timewise": [{"attr": "slope"}], + "linear_trend": [{"attr": "slope"}], } - extracted_features = extract_features(df, default_fc_parameters=settings, - column_value="val", column_id="id", - column_kind="kind", - column_sort="sort") + extracted_features = extract_features( + df, + default_fc_parameters=settings, + column_value="val", + column_id="id", + column_kind="kind", + column_sort="sort", + ) self.assertIsInstance(extracted_features, pd.DataFrame) @@ -106,9 +163,10 @@ def test_extract_features_for_index_based_functions(self): # Test that the index of the returned df is the ID and not the timestamp self.assertTrue(extracted_features.index.dtype != df.index.dtype) - self.assertTrue(extracted_features.index.dtype == df['id'].dtype) + self.assertTrue(extracted_features.index.dtype == df["id"].dtype) self.assertEqual( - sorted(extracted_features.index.unique().tolist()), 
sorted(df['id'].unique().tolist()) + sorted(extracted_features.index.unique().tolist()), + sorted(df["id"].unique().tolist()), ) def test_extract_features_custom_function(self): @@ -117,21 +175,27 @@ def test_extract_features_custom_function(self): def custom_function(x, p): return len(x) + p - settings = PickeableSettings({ - 'mean': None, - custom_function: [{"p": 1}, {"p": -1}], - }) + settings = PickeableSettings( + { + "mean": None, + custom_function: [{"p": 1}, {"p": -1}], + } + ) - extracted_features = extract_features(df, default_fc_parameters=settings, - column_value="val", column_id="id", - column_kind="kind", - column_sort="sort") + extracted_features = extract_features( + df, + default_fc_parameters=settings, + column_value="val", + column_id="id", + column_kind="kind", + column_sort="sort", + ) self.assertIsInstance(extracted_features, pd.DataFrame) - mean_a = extracted_features['a__mean'].values - custom_function_a_1 = extracted_features['a__custom_function__p_1'].values - custom_function_a_m1 = extracted_features['a__custom_function__p_-1'].values + mean_a = extracted_features["a__mean"].values + custom_function_a_1 = extracted_features["a__custom_function__p_1"].values + custom_function_a_m1 = extracted_features["a__custom_function__p_-1"].values self.assertAlmostEqual(mean_a[0], 34.55) self.assertAlmostEqual(mean_a[1], 50.85) @@ -144,77 +208,125 @@ def test_extract_features_after_randomisation(self): df = self.create_test_data_sample() df_random = df.copy().sample(frac=1) - extracted_features = extract_features(df, column_id="id", column_sort="sort", - column_kind="kind", - column_value="val", - n_jobs=self.n_jobs).sort_index() - extracted_features_from_random = extract_features(df_random, column_id="id", - column_sort="sort", - column_kind="kind", - column_value="val", - n_jobs=self.n_jobs).sort_index() - - self.assertCountEqual(extracted_features.columns, - extracted_features_from_random.columns) + extracted_features = extract_features( + df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + n_jobs=self.n_jobs, + ).sort_index() + extracted_features_from_random = extract_features( + df_random, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + n_jobs=self.n_jobs, + ).sort_index() + + self.assertCountEqual( + extracted_features.columns, extracted_features_from_random.columns + ) for col in extracted_features: - self.assertIsNone(np.testing.assert_array_almost_equal(extracted_features[col], - extracted_features_from_random[ - col])) + self.assertIsNone( + np.testing.assert_array_almost_equal( + extracted_features[col], extracted_features_from_random[col] + ) + ) def test_profiling_file_written_out(self): - df = pd.DataFrame(data={"id": np.repeat([1, 2], 10), "val": np.random.normal(0, 1, 20)}) + df = pd.DataFrame( + data={"id": np.repeat([1, 2], 10), "val": np.random.normal(0, 1, 20)} + ) profiling_filename = os.path.join(self.directory, "test_profiling.txt") - X = extract_features(df, column_id="id", column_value="val", n_jobs=self.n_jobs, - profile=True, profiling_filename=profiling_filename) + X = extract_features( + df, + column_id="id", + column_value="val", + n_jobs=self.n_jobs, + profile=True, + profiling_filename=profiling_filename, + ) self.assertTrue(os.path.isfile(profiling_filename)) os.remove(profiling_filename) def test_profiling_cumulative_file_written_out(self): - PROFILING_FILENAME = os.path.join(self.directory, "test_profiling_cumulative.txt") + PROFILING_FILENAME = os.path.join( + 
self.directory, "test_profiling_cumulative.txt" + ) PROFILING_SORTING = "cumulative" - df = pd.DataFrame(data={"id": np.repeat([1, 2], 10), "val": np.random.normal(0, 1, 20)}) - extract_features(df, column_id="id", column_value="val", n_jobs=self.n_jobs, - profile=True, profiling_filename=PROFILING_FILENAME, - profiling_sorting=PROFILING_SORTING) + df = pd.DataFrame( + data={"id": np.repeat([1, 2], 10), "val": np.random.normal(0, 1, 20)} + ) + extract_features( + df, + column_id="id", + column_value="val", + n_jobs=self.n_jobs, + profile=True, + profiling_filename=PROFILING_FILENAME, + profiling_sorting=PROFILING_SORTING, + ) self.assertTrue(os.path.isfile(PROFILING_FILENAME)) os.remove(PROFILING_FILENAME) def test_extract_features_without_settings(self): - df = pd.DataFrame(data={"id": np.repeat([1, 2], 10), - "value1": np.random.normal(0, 1, 20), - "value2": np.random.normal(0, 1, 20)}) - X = extract_features(df, column_id="id", - n_jobs=self.n_jobs) + df = pd.DataFrame( + data={ + "id": np.repeat([1, 2], 10), + "value1": np.random.normal(0, 1, 20), + "value2": np.random.normal(0, 1, 20), + } + ) + X = extract_features(df, column_id="id", n_jobs=self.n_jobs) self.assertIn("value1__maximum", list(X.columns)) self.assertIn("value2__maximum", list(X.columns)) def test_extract_features_with_and_without_parallelization(self): df = self.create_test_data_sample() - features_parallel = extract_features(df, column_id="id", column_sort="sort", - column_kind="kind", column_value="val", - n_jobs=2) + features_parallel = extract_features( + df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + n_jobs=2, + ) - features_serial = extract_features(df, column_id="id", column_sort="sort", - column_kind="kind", column_value="val", - n_jobs=0) + features_serial = extract_features( + df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + n_jobs=0, + ) self.assertCountEqual(features_parallel.columns, features_serial.columns) for col in features_parallel.columns: - np.testing.assert_array_almost_equal(features_parallel[col], features_serial[col]) + np.testing.assert_array_almost_equal( + features_parallel[col], features_serial[col] + ) def test_extract_index_preservation(self): df = self.create_test_data_nearly_numerical_indices() - extracted_features = extract_features(df, column_id="id", column_sort="sort", - column_kind="kind", column_value="val", - n_jobs=self.n_jobs) + extracted_features = extract_features( + df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + n_jobs=self.n_jobs, + ) self.assertIsInstance(extracted_features, pd.DataFrame) self.assertEqual(set(df["id"]), set(extracted_features.index)) @@ -222,8 +334,13 @@ def test_extract_index_preservation(self): def test_extract_features_alphabetically_sorted(self): df = self.create_test_data_sample() - features = extract_features(df, column_id="id", column_sort="sort", - column_kind="kind", column_value="val") + features = extract_features( + df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + ) for col_name in features.columns: # split out the configuration of the features calculator @@ -239,28 +356,42 @@ def setUp(self): self.n_jobs = 2 # only calculate some features to reduce load on travis ci - self.name_to_param = {"maximum": None, - "sum_values": None, - "abs_energy": None, - "minimum": None, - "mean": None, - "median": None} + self.name_to_param = { + "maximum": None, + "sum_values": None, + 
"abs_energy": None, + "minimum": None, + "mean": None, + "median": None, + } def test_extract_features(self): # todo: implement more methods and test more aspects df = self.create_test_data_sample() - extracted_features = extract_features(df, column_id="id", column_sort="sort", - column_kind="kind", - column_value="val", - n_jobs=self.n_jobs) + extracted_features = extract_features( + df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + n_jobs=self.n_jobs, + ) self.assertIsInstance(extracted_features, pd.DataFrame) self.assertTrue(np.all(extracted_features.a__maximum == np.array([71, 77]))) - self.assertTrue(np.all(extracted_features.a__sum_values == np.array([691, 1017]))) - self.assertTrue(np.all(extracted_features.a__abs_energy == np.array([32211, 63167]))) - self.assertTrue(np.all(extracted_features.b__sum_values == np.array([757, 695]))) + self.assertTrue( + np.all(extracted_features.a__sum_values == np.array([691, 1017])) + ) + self.assertTrue( + np.all(extracted_features.a__abs_energy == np.array([32211, 63167])) + ) + self.assertTrue( + np.all(extracted_features.b__sum_values == np.array([757, 695])) + ) self.assertTrue(np.all(extracted_features.b__minimum == np.array([3, 1]))) - self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483]))) + self.assertTrue( + np.all(extracted_features.b__abs_energy == np.array([36619, 35483])) + ) self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75]))) self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0]))) @@ -277,9 +408,15 @@ def test_distributor_map_reduce_is_called(self): mock.close.return_value = None mock.map_reduce.return_value = [] - X = extract_features(timeseries_container=df, column_id="id", column_sort="sort", - column_kind="kind", column_value="val", - default_fc_parameters=self.name_to_param, distributor=mock) + X = extract_features( + timeseries_container=df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + default_fc_parameters=self.name_to_param, + distributor=mock, + ) self.assertTrue(mock.map_reduce.called) @@ -290,8 +427,14 @@ def test_distributor_close_is_called(self): mock.close = Mock() mock.close.return_value = None - X = extract_features(timeseries_container=df, column_id="id", column_sort="sort", - column_kind="kind", column_value="val", - default_fc_parameters=self.name_to_param, distributor=mock) + X = extract_features( + timeseries_container=df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + default_fc_parameters=self.name_to_param, + distributor=mock, + ) self.assertTrue(mock.close.called) diff --git a/tests/units/feature_extraction/test_feature_calculations.py b/tests/units/feature_extraction/test_feature_calculations.py index e7d9e24b1..7fee5e073 100644 --- a/tests/units/feature_extraction/test_feature_calculations.py +++ b/tests/units/feature_extraction/test_feature_calculations.py @@ -10,7 +10,9 @@ from tsfresh.feature_extraction.feature_calculators import * from tsfresh.feature_extraction.feature_calculators import _roll from tsfresh.feature_extraction.feature_calculators import _get_length_sequences_where -from tsfresh.feature_extraction.feature_calculators import _estimate_friedrich_coefficients +from tsfresh.feature_extraction.feature_calculators import ( + _estimate_friedrich_coefficients, +) from tsfresh.feature_extraction.feature_calculators import _aggregate_on_chunks from tsfresh.feature_extraction.feature_calculators 
import _into_subchunks from tsfresh.examples.driftbif_simulation import velocity @@ -31,58 +33,114 @@ def assertIsNaN(self, result): def assertEqualOnAllArrayTypes(self, f, input_to_f, result, *args, **kwargs): expected_result = f(input_to_f, *args, **kwargs) - self.assertEqual(expected_result, result, - msg="Not equal for lists: {} != {}".format(expected_result, result)) + self.assertEqual( + expected_result, + result, + msg="Not equal for lists: {} != {}".format(expected_result, result), + ) expected_result = f(np.array(input_to_f), *args, **kwargs) - self.assertEqual(expected_result, result, - msg="Not equal for numpy.arrays: {} != {}".format(expected_result, result)) + self.assertEqual( + expected_result, + result, + msg="Not equal for numpy.arrays: {} != {}".format(expected_result, result), + ) expected_result = f(pd.Series(input_to_f, dtype="float64"), *args, **kwargs) - self.assertEqual(expected_result, result, - msg="Not equal for pandas.Series: {} != {}".format(expected_result, result)) + self.assertEqual( + expected_result, + result, + msg="Not equal for pandas.Series: {} != {}".format(expected_result, result), + ) def assertTrueOnAllArrayTypes(self, f, input_to_f, *args, **kwargs): self.assertTrue(f(input_to_f, *args, **kwargs), msg="Not true for lists") - self.assertTrue(f(np.array(input_to_f), *args, **kwargs), msg="Not true for numpy.arrays") - self.assertTrue(f(pd.Series(input_to_f), *args, **kwargs), msg="Not true for pandas.Series") + self.assertTrue( + f(np.array(input_to_f), *args, **kwargs), msg="Not true for numpy.arrays" + ) + self.assertTrue( + f(pd.Series(input_to_f), *args, **kwargs), msg="Not true for pandas.Series" + ) def assertAllTrueOnAllArrayTypes(self, f, input_to_f, *args, **kwargs): - self.assertTrue(all(dict(f(input_to_f, *args, **kwargs)).values()), msg="Not true for lists") - self.assertTrue(all(dict(f(np.array(input_to_f), *args, **kwargs)).values()), msg="Not true for numpy.arrays") - self.assertTrue(all(dict(f(pd.Series(input_to_f), *args, **kwargs)).values()), msg="Not true for pandas.Series") + self.assertTrue( + all(dict(f(input_to_f, *args, **kwargs)).values()), msg="Not true for lists" + ) + self.assertTrue( + all(dict(f(np.array(input_to_f), *args, **kwargs)).values()), + msg="Not true for numpy.arrays", + ) + self.assertTrue( + all(dict(f(pd.Series(input_to_f), *args, **kwargs)).values()), + msg="Not true for pandas.Series", + ) def assertFalseOnAllArrayTypes(self, f, input_to_f, *args, **kwargs): self.assertFalse(f(input_to_f, *args, **kwargs), msg="Not false for lists") - self.assertFalse(f(np.array(input_to_f), *args, **kwargs), msg="Not false for numpy.arrays") - self.assertFalse(f(pd.Series(input_to_f), *args, **kwargs), msg="Not false for pandas.Series") + self.assertFalse( + f(np.array(input_to_f), *args, **kwargs), msg="Not false for numpy.arrays" + ) + self.assertFalse( + f(pd.Series(input_to_f), *args, **kwargs), msg="Not false for pandas.Series" + ) def assertAllFalseOnAllArrayTypes(self, f, input_to_f, *args, **kwargs): - self.assertFalse(any(dict(f(input_to_f, *args, **kwargs)).values()), msg="Not false for lists") - self.assertFalse(any(dict(f(np.array(input_to_f), *args, **kwargs)).values()), - msg="Not false for numpy.arrays") - self.assertFalse(any(dict(f(pd.Series(input_to_f), *args, **kwargs)).values()), - msg="Not false for pandas.Series") + self.assertFalse( + any(dict(f(input_to_f, *args, **kwargs)).values()), + msg="Not false for lists", + ) + self.assertFalse( + any(dict(f(np.array(input_to_f), *args, 
**kwargs)).values()), + msg="Not false for numpy.arrays", + ) + self.assertFalse( + any(dict(f(pd.Series(input_to_f), *args, **kwargs)).values()), + msg="Not false for pandas.Series", + ) def assertAlmostEqualOnAllArrayTypes(self, f, input_to_f, result, *args, **kwargs): expected_result = f(input_to_f, *args, **kwargs) - self.assertAlmostEqual(expected_result, result, - msg="Not almost equal for lists: {} != {}".format(expected_result, result)) + self.assertAlmostEqual( + expected_result, + result, + msg="Not almost equal for lists: {} != {}".format(expected_result, result), + ) expected_result = f(np.array(input_to_f), *args, **kwargs) - self.assertAlmostEqual(expected_result, result, - msg="Not almost equal for numpy.arrays: {} != {}".format(expected_result, result)) + self.assertAlmostEqual( + expected_result, + result, + msg="Not almost equal for numpy.arrays: {} != {}".format( + expected_result, result + ), + ) expected_result = f(pd.Series(input_to_f, dtype="float64"), *args, **kwargs) - self.assertAlmostEqual(expected_result, result, - msg="Not almost equal for pandas.Series: {} != {}".format(expected_result, result)) + self.assertAlmostEqual( + expected_result, + result, + msg="Not almost equal for pandas.Series: {} != {}".format( + expected_result, result + ), + ) def assertIsNanOnAllArrayTypes(self, f, input_to_f, *args, **kwargs): - self.assertTrue(np.isnan(f(input_to_f, *args, **kwargs)), msg="Not NaN for lists") - self.assertTrue(np.isnan(f(np.array(input_to_f), *args, **kwargs)), msg="Not NaN for numpy.arrays") - self.assertTrue(np.isnan(f(pd.Series(input_to_f, dtype="float64"), *args, **kwargs)), - msg="Not NaN for pandas.Series") + self.assertTrue( + np.isnan(f(input_to_f, *args, **kwargs)), msg="Not NaN for lists" + ) + self.assertTrue( + np.isnan(f(np.array(input_to_f), *args, **kwargs)), + msg="Not NaN for numpy.arrays", + ) + self.assertTrue( + np.isnan(f(pd.Series(input_to_f, dtype="float64"), *args, **kwargs)), + msg="Not NaN for pandas.Series", + ) def assertEqualPandasSeriesWrapper(self, f, input_to_f, result, *args, **kwargs): - self.assertEqual(f(pd.Series(input_to_f), *args, **kwargs), result, - msg="Not equal for pandas.Series: {} != {}".format( - f(pd.Series(input_to_f), *args, **kwargs), result)) + self.assertEqual( + f(pd.Series(input_to_f), *args, **kwargs), + result, + msg="Not equal for pandas.Series: {} != {}".format( + f(pd.Series(input_to_f), *args, **kwargs), result + ), + ) def test__roll(self): x = np.random.normal(size=30) @@ -91,23 +149,39 @@ def test__roll(self): np.testing.assert_array_equal(_roll(x, -shift), np.roll(x, -shift)) def test___get_length_sequences_where(self): - self.assertEqualOnAllArrayTypes(_get_length_sequences_where, [0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1], - [1, 3, 1, 2]) - self.assertEqualOnAllArrayTypes(_get_length_sequences_where, - [0, True, 0, 0, True, True, True, 0, 0, True, 0, True, True], - [1, 3, 1, 2]) - self.assertEqualOnAllArrayTypes(_get_length_sequences_where, - [0, True, 0, 0, 1, True, 1, 0, 0, True, 0, 1, True], [1, 3, 1, 2]) + self.assertEqualOnAllArrayTypes( + _get_length_sequences_where, + [0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1], + [1, 3, 1, 2], + ) + self.assertEqualOnAllArrayTypes( + _get_length_sequences_where, + [0, True, 0, 0, True, True, True, 0, 0, True, 0, True, True], + [1, 3, 1, 2], + ) + self.assertEqualOnAllArrayTypes( + _get_length_sequences_where, + [0, True, 0, 0, 1, True, 1, 0, 0, True, 0, 1, True], + [1, 3, 1, 2], + ) self.assertEqualOnAllArrayTypes(_get_length_sequences_where, [0] * 10, [0]) 
self.assertEqualOnAllArrayTypes(_get_length_sequences_where, [], [0]) def test__into_subchunks(self): - np.testing.assert_array_equal(_into_subchunks(range(7), 3, 2), np.array([[0, 1, 2], [2, 3, 4], [4, 5, 6]])) - np.testing.assert_array_equal(_into_subchunks(range(5), 3), np.array([[0, 1, 2], [1, 2, 3], [2, 3, 4]])) + np.testing.assert_array_equal( + _into_subchunks(range(7), 3, 2), np.array([[0, 1, 2], [2, 3, 4], [4, 5, 6]]) + ) + np.testing.assert_array_equal( + _into_subchunks(range(5), 3), np.array([[0, 1, 2], [1, 2, 3], [2, 3, 4]]) + ) def test_variance_larger_than_standard_deviation(self): - self.assertFalseOnAllArrayTypes(variance_larger_than_standard_deviation, [-1, -1, 1, 1, 1]) - self.assertTrueOnAllArrayTypes(variance_larger_than_standard_deviation, [-1, -1, 1, 1, 2]) + self.assertFalseOnAllArrayTypes( + variance_larger_than_standard_deviation, [-1, -1, 1, 1, 1] + ) + self.assertTrueOnAllArrayTypes( + variance_larger_than_standard_deviation, [-1, -1, 1, 1, 2] + ) def test_large_standard_deviation(self): self.assertFalseOnAllArrayTypes(large_standard_deviation, [1, 1, 1, 1], r=0) @@ -118,16 +192,27 @@ def test_large_standard_deviation(self): self.assertFalseOnAllArrayTypes(large_standard_deviation, [-1, -1, 1, 1], r=0.5) def test_symmetry_looking(self): - self.assertAllTrueOnAllArrayTypes(symmetry_looking, [-1, -1, 1, 1], - [dict(r=0.05), dict(r=0.75)]) - self.assertAllFalseOnAllArrayTypes(symmetry_looking, [-1, -1, 1, 1], [dict(r=0)]) - self.assertAllFalseOnAllArrayTypes(symmetry_looking, [-1, -1, -1, -1, 1], [dict(r=0.05)]) - self.assertAllTrueOnAllArrayTypes(symmetry_looking, [-2, -2, -2, -1, -1, -1], [dict(r=0.05)]) - self.assertAllTrueOnAllArrayTypes(symmetry_looking, [-0.9, -0.900001], [dict(r=0.05)]) + self.assertAllTrueOnAllArrayTypes( + symmetry_looking, [-1, -1, 1, 1], [dict(r=0.05), dict(r=0.75)] + ) + self.assertAllFalseOnAllArrayTypes( + symmetry_looking, [-1, -1, 1, 1], [dict(r=0)] + ) + self.assertAllFalseOnAllArrayTypes( + symmetry_looking, [-1, -1, -1, -1, 1], [dict(r=0.05)] + ) + self.assertAllTrueOnAllArrayTypes( + symmetry_looking, [-2, -2, -2, -1, -1, -1], [dict(r=0.05)] + ) + self.assertAllTrueOnAllArrayTypes( + symmetry_looking, [-0.9, -0.900001], [dict(r=0.05)] + ) def test_has_duplicate_max(self): self.assertTrueOnAllArrayTypes(has_duplicate_max, [2.1, 0, 0, 2.1, 1.1]) - self.assertFalseOnAllArrayTypes(has_duplicate_max, np.array([2.1, 0, 0, 2, 1.1])) + self.assertFalseOnAllArrayTypes( + has_duplicate_max, np.array([2.1, 0, 0, 2, 1.1]) + ) self.assertTrueOnAllArrayTypes(has_duplicate_max, [1, 1, 1, 1]) self.assertFalseOnAllArrayTypes(has_duplicate_max, np.array([0])) self.assertTrueOnAllArrayTypes(has_duplicate_max, np.array([1, 1])) @@ -156,44 +241,43 @@ def test_agg_autocorrelation_returns_correct_values(self): param = [{"f_agg": "mean", "maxlag": 10}] x = [1, 1, 1, 1, 1, 1, 1] expected_res = 0 - res = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\"__maxlag_10"] + res = dict(agg_autocorrelation(x, param=param))['f_agg_"mean"__maxlag_10'] self.assertAlmostEqual(res, expected_res, places=4) x = [1, 2, -3] expected_res = 1 / np.var(x) * (((1 * 2 + 2 * (-3)) / 2 + (1 * -3)) / 2) - res = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\"__maxlag_10"] + res = dict(agg_autocorrelation(x, param=param))['f_agg_"mean"__maxlag_10'] self.assertAlmostEqual(res, expected_res, places=4) np.random.seed(42) x = np.random.normal(size=3000) expected_res = 0 - res = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\"__maxlag_10"] + res = 
dict(agg_autocorrelation(x, param=param))['f_agg_"mean"__maxlag_10'] self.assertAlmostEqual(res, expected_res, places=2) param = [{"f_agg": "median", "maxlag": 10}] x = [1, 1, 1, 1, 1, 1, 1] expected_res = 0 - res = dict(agg_autocorrelation(x, param=param))["f_agg_\"median\"__maxlag_10"] + res = dict(agg_autocorrelation(x, param=param))['f_agg_"median"__maxlag_10'] self.assertAlmostEqual(res, expected_res, places=4) x = [1, 2, -3] expected_res = 1 / np.var(x) * (((1 * 2 + 2 * (-3)) / 2 + (1 * -3)) / 2) - res = dict(agg_autocorrelation(x, param=param))["f_agg_\"median\"__maxlag_10"] + res = dict(agg_autocorrelation(x, param=param))['f_agg_"median"__maxlag_10'] self.assertAlmostEqual(res, expected_res, places=4) def test_agg_autocorrelation_returns_max_lag_does_not_affect_other_results(self): - param = [{"f_agg": "mean", "maxlag": 1}, - {"f_agg": "mean", "maxlag": 10}] + param = [{"f_agg": "mean", "maxlag": 1}, {"f_agg": "mean", "maxlag": 10}] x = range(10) - res1 = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\"__maxlag_1"] - res10 = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\"__maxlag_10"] + res1 = dict(agg_autocorrelation(x, param=param))['f_agg_"mean"__maxlag_1'] + res10 = dict(agg_autocorrelation(x, param=param))['f_agg_"mean"__maxlag_10'] self.assertAlmostEqual(res1, 0.77777777, places=4) self.assertAlmostEqual(res10, -0.64983164983165, places=4) param = [{"f_agg": "mean", "maxlag": 1}] x = range(10) - res1 = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\"__maxlag_1"] + res1 = dict(agg_autocorrelation(x, param=param))['f_agg_"mean"__maxlag_1'] self.assertAlmostEqual(res1, 0.77777777, places=4) def test_partial_autocorrelation(self): @@ -272,7 +356,7 @@ def test_augmented_dickey_fuller(self): param = [ {"autolag": "BIC", "attr": "teststat"}, {"autolag": "BIC", "attr": "pvalue"}, - {"autolag": "BIC", "attr": "usedlag"} + {"autolag": "BIC", "attr": "usedlag"}, ] expected_index = [ 'attr_"teststat"__autolag_"BIC"', @@ -297,7 +381,7 @@ def test_augmented_dickey_fuller(self): param = [ {"autolag": "AIC", "attr": "teststat"}, {"autolag": "AIC", "attr": "pvalue"}, - {"autolag": "AIC", "attr": "usedlag"} + {"autolag": "AIC", "attr": "usedlag"}, ] expected_index = [ 'attr_"teststat"__autolag_"AIC"', @@ -312,7 +396,9 @@ def test_augmented_dickey_fuller(self): self.assertEqual(res['attr_"usedlag"__autolag_"AIC"'], 0) # Check if LinAlgError and ValueError are catched - res_linalg_error = augmented_dickey_fuller(x=np.repeat(np.nan, 100), param=param) + res_linalg_error = augmented_dickey_fuller( + x=np.repeat(np.nan, 100), param=param + ) res_value_error = augmented_dickey_fuller(x=[], param=param) for index, val in res_linalg_error: self.assertIsNaN(val) @@ -320,7 +406,9 @@ def test_augmented_dickey_fuller(self): self.assertIsNaN(val) # Should return NaN if "attr" is unknown - res_attr_error = augmented_dickey_fuller(x=x, param=[{"autolag": "AIC", "attr": ""}]) + res_attr_error = augmented_dickey_fuller( + x=x, param=[{"autolag": "AIC", "attr": ""}] + ) for index, val in res_attr_error: self.assertIsNaN(val) @@ -338,73 +426,118 @@ def test_cid_ce(self): self.assertEqualOnAllArrayTypes(cid_ce, [1, 1, 1], 0, normalize=False) self.assertEqualOnAllArrayTypes(cid_ce, [0.5, 3.5, 7.5], 5, normalize=False) - self.assertEqualOnAllArrayTypes(cid_ce, [-4.33, -1.33, 2.67], 5, normalize=False) + self.assertEqualOnAllArrayTypes( + cid_ce, [-4.33, -1.33, 2.67], 5, normalize=False + ) def test_lempel_ziv_complexity(self): - 
self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, [1, 1, 1], 2. / 3, bins=2) - self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, [1, 1, 1], 2. / 3, bins=5) - - self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, [1, 1, 1, 1, 1, 1, 1], - 0.4285714285, bins=2) - self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, [1, 1, 1, 2, 1, 1, 1], - 0.5714285714, bins=2) - - self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, - [-1, 4.3, 5, 1, -4.5, 1, 5, 7, -3.4, 6], - 0.8, bins=10) - self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, - [-1, np.nan, 5, 1, -4.5, 1, 5, 7, -3.4, 6], - 0.4, bins=10) - self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, - np.linspace(0, 1, 10), - 0.6, bins=3) - self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, - [1, 1, 2, 3, 4, 5, 6, 0, 7, 8], - 0.6, bins=3) + self.assertAlmostEqualOnAllArrayTypes( + lempel_ziv_complexity, [1, 1, 1], 2.0 / 3, bins=2 + ) + self.assertAlmostEqualOnAllArrayTypes( + lempel_ziv_complexity, [1, 1, 1], 2.0 / 3, bins=5 + ) + + self.assertAlmostEqualOnAllArrayTypes( + lempel_ziv_complexity, [1, 1, 1, 1, 1, 1, 1], 0.4285714285, bins=2 + ) + self.assertAlmostEqualOnAllArrayTypes( + lempel_ziv_complexity, [1, 1, 1, 2, 1, 1, 1], 0.5714285714, bins=2 + ) + + self.assertAlmostEqualOnAllArrayTypes( + lempel_ziv_complexity, [-1, 4.3, 5, 1, -4.5, 1, 5, 7, -3.4, 6], 0.8, bins=10 + ) + self.assertAlmostEqualOnAllArrayTypes( + lempel_ziv_complexity, + [-1, np.nan, 5, 1, -4.5, 1, 5, 7, -3.4, 6], + 0.4, + bins=10, + ) + self.assertAlmostEqualOnAllArrayTypes( + lempel_ziv_complexity, np.linspace(0, 1, 10), 0.6, bins=3 + ) + self.assertAlmostEqualOnAllArrayTypes( + lempel_ziv_complexity, [1, 1, 2, 3, 4, 5, 6, 0, 7, 8], 0.6, bins=3 + ) def test_fourier_entropy(self): - self.assertAlmostEqualOnAllArrayTypes(fourier_entropy, [1, 2, 1], 0.693147180, bins=2) - self.assertAlmostEqualOnAllArrayTypes(fourier_entropy, [1, 2, 1], 0.693147180, bins=5) + self.assertAlmostEqualOnAllArrayTypes( + fourier_entropy, [1, 2, 1], 0.693147180, bins=2 + ) + self.assertAlmostEqualOnAllArrayTypes( + fourier_entropy, [1, 2, 1], 0.693147180, bins=5 + ) - self.assertAlmostEqualOnAllArrayTypes(fourier_entropy, [1, 1, 2, 1, 1, 1, 1], - 0.5623351446188083, bins=5) - self.assertAlmostEqualOnAllArrayTypes(fourier_entropy, [1, 1, 1, 1, 2, 1, 1], - 1.0397207708399179, bins=5) + self.assertAlmostEqualOnAllArrayTypes( + fourier_entropy, [1, 1, 2, 1, 1, 1, 1], 0.5623351446188083, bins=5 + ) + self.assertAlmostEqualOnAllArrayTypes( + fourier_entropy, [1, 1, 1, 1, 2, 1, 1], 1.0397207708399179, bins=5 + ) - self.assertAlmostEqualOnAllArrayTypes(fourier_entropy, - [-1, 4.3, 5, 1, -4.5, 1, 5, 7, -3.4, 6], - 1.5607104090414063, bins=10) - self.assertIsNanOnAllArrayTypes(fourier_entropy, - [-1, np.nan, 5, 1, -4.5, 1, 5, 7, -3.4, 6], - bins=10) + self.assertAlmostEqualOnAllArrayTypes( + fourier_entropy, + [-1, 4.3, 5, 1, -4.5, 1, 5, 7, -3.4, 6], + 1.5607104090414063, + bins=10, + ) + self.assertIsNanOnAllArrayTypes( + fourier_entropy, [-1, np.nan, 5, 1, -4.5, 1, 5, 7, -3.4, 6], bins=10 + ) def test_permutation_entropy(self): - self.assertAlmostEqualOnAllArrayTypes(permutation_entropy, [4, 7, 9, 10, 6, 11, 3], 1.054920167, - dimension=3, tau=1) + self.assertAlmostEqualOnAllArrayTypes( + permutation_entropy, + [4, 7, 9, 10, 6, 11, 3], + 1.054920167, + dimension=3, + tau=1, + ) # should grow - self.assertAlmostEqualOnAllArrayTypes(permutation_entropy, [1, -1, 1, -1, 1, -1, 1, -1], - 0.6931471805599453, dimension=3, tau=1) - 
self.assertAlmostEqualOnAllArrayTypes(permutation_entropy, [1, -1, 1, -1, 1, 1, 1, -1], - 1.3296613488547582, dimension=3, tau=1) - - self.assertAlmostEqualOnAllArrayTypes(permutation_entropy, - [-1, 4.3, 5, 1, -4.5, 1, 5, 7, -3.4, 6], - 1.0397207708399179, dimension=3, tau=2) + self.assertAlmostEqualOnAllArrayTypes( + permutation_entropy, + [1, -1, 1, -1, 1, -1, 1, -1], + 0.6931471805599453, + dimension=3, + tau=1, + ) + self.assertAlmostEqualOnAllArrayTypes( + permutation_entropy, + [1, -1, 1, -1, 1, 1, 1, -1], + 1.3296613488547582, + dimension=3, + tau=1, + ) + + self.assertAlmostEqualOnAllArrayTypes( + permutation_entropy, + [-1, 4.3, 5, 1, -4.5, 1, 5, 7, -3.4, 6], + 1.0397207708399179, + dimension=3, + tau=2, + ) # nan is treated like any other number - self.assertAlmostEqualOnAllArrayTypes(permutation_entropy, - [-1, 4.3, 5, 1, -4.5, 1, 5, np.nan, -3.4, 6], - 1.0397207708399179, dimension=3, tau=2) + self.assertAlmostEqualOnAllArrayTypes( + permutation_entropy, + [-1, 4.3, 5, 1, -4.5, 1, 5, np.nan, -3.4, 6], + 1.0397207708399179, + dimension=3, + tau=2, + ) # if too short, return nan - self.assertIsNanOnAllArrayTypes(permutation_entropy, [1, -1], dimension=3, tau=1) + self.assertIsNanOnAllArrayTypes( + permutation_entropy, [1, -1], dimension=3, tau=1 + ) def test_ratio_beyond_r_sigma(self): x = [0, 1] * 10 + [10, 20, -30] # std of x is 7.21, mean 3.04 - self.assertEqualOnAllArrayTypes(ratio_beyond_r_sigma, x, 3. / len(x), r=1) - self.assertEqualOnAllArrayTypes(ratio_beyond_r_sigma, x, 2. / len(x), r=2) - self.assertEqualOnAllArrayTypes(ratio_beyond_r_sigma, x, 1. / len(x), r=3) + self.assertEqualOnAllArrayTypes(ratio_beyond_r_sigma, x, 3.0 / len(x), r=1) + self.assertEqualOnAllArrayTypes(ratio_beyond_r_sigma, x, 2.0 / len(x), r=2) + self.assertEqualOnAllArrayTypes(ratio_beyond_r_sigma, x, 1.0 / len(x), r=3) self.assertEqualOnAllArrayTypes(ratio_beyond_r_sigma, x, 0, r=20) def test_mean_abs_change(self): @@ -419,9 +552,13 @@ def test_mean_change(self): self.assertIsNanOnAllArrayTypes(mean_change, []) def test_mean_second_derivate_central(self): - self.assertEqualOnAllArrayTypes(mean_second_derivative_central, list(range(10)), 0) + self.assertEqualOnAllArrayTypes( + mean_second_derivative_central, list(range(10)), 0 + ) self.assertEqualOnAllArrayTypes(mean_second_derivative_central, [1, 3, 5], 0) - self.assertEqualOnAllArrayTypes(mean_second_derivative_central, [1, 3, 7, -3], -3) + self.assertEqualOnAllArrayTypes( + mean_second_derivative_central, [1, 3, 7, -3], -3 + ) def test_median(self): self.assertEqualOnAllArrayTypes(median, [1, 1, 2, 2], 1.5) @@ -444,13 +581,22 @@ def test_length(self): def test_standard_deviation(self): self.assertAlmostEqualOnAllArrayTypes(standard_deviation, [1, 1, -1, -1], 1) - self.assertAlmostEqualOnAllArrayTypes(standard_deviation, [1, 2, -2, -1], 1.58113883008) + self.assertAlmostEqualOnAllArrayTypes( + standard_deviation, [1, 2, -2, -1], 1.58113883008 + ) self.assertIsNanOnAllArrayTypes(standard_deviation, []) def test_variation_coefficient(self): - self.assertIsNanOnAllArrayTypes(variation_coefficient, [1, 1, -1, -1], ) - self.assertAlmostEqualOnAllArrayTypes(variation_coefficient, [1, 2, -3, -1], -7.681145747868608) - self.assertAlmostEqualOnAllArrayTypes(variation_coefficient, [1, 2, 4, -1], 1.2018504251546631) + self.assertIsNanOnAllArrayTypes( + variation_coefficient, + [1, 1, -1, -1], + ) + self.assertAlmostEqualOnAllArrayTypes( + variation_coefficient, [1, 2, -3, -1], -7.681145747868608 + ) + self.assertAlmostEqualOnAllArrayTypes( + 
variation_coefficient, [1, 2, 4, -1], 1.2018504251546631 + ) self.assertIsNanOnAllArrayTypes(variation_coefficient, []) def test_variance(self): @@ -460,17 +606,23 @@ def test_variance(self): def test_skewness(self): self.assertEqualOnAllArrayTypes(skewness, [1, 1, 1, 2, 2, 2], 0) - self.assertAlmostEqualOnAllArrayTypes(skewness, [1, 1, 1, 2, 2], 0.6085806194501855) + self.assertAlmostEqualOnAllArrayTypes( + skewness, [1, 1, 1, 2, 2], 0.6085806194501855 + ) self.assertEqualOnAllArrayTypes(skewness, [1, 1, 1], 0) self.assertIsNanOnAllArrayTypes(skewness, [1, 1]) def test_kurtosis(self): - self.assertAlmostEqualOnAllArrayTypes(kurtosis, [1, 1, 1, 2, 2], -3.333333333333333) + self.assertAlmostEqualOnAllArrayTypes( + kurtosis, [1, 1, 1, 2, 2], -3.333333333333333 + ) self.assertAlmostEqualOnAllArrayTypes(kurtosis, [1, 1, 1, 1], 0) self.assertIsNanOnAllArrayTypes(kurtosis, [1, 1, 1]) def test_root_mean_square(self): - self.assertAlmostEqualOnAllArrayTypes(root_mean_square, [1, 1, 1, 2, 2], 1.4832396974191) + self.assertAlmostEqualOnAllArrayTypes( + root_mean_square, [1, 1, 1, 2, 2], 1.4832396974191 + ) self.assertAlmostEqualOnAllArrayTypes(root_mean_square, [0], 0) self.assertIsNanOnAllArrayTypes(root_mean_square, []) self.assertAlmostEqualOnAllArrayTypes(root_mean_square, [1], 1) @@ -478,12 +630,24 @@ def test_root_mean_square(self): def test_mean_n_absolute_max(self): self.assertIsNanOnAllArrayTypes(mean_n_absolute_max, [], number_of_maxima=1) - self.assertIsNanOnAllArrayTypes(mean_n_absolute_max, [12, 3], number_of_maxima=10) - self.assertRaises(AssertionError, mean_n_absolute_max, [12, 3], number_of_maxima=0) - self.assertRaises(AssertionError, mean_n_absolute_max, [12, 3], number_of_maxima=-1) - self.assertAlmostEqualOnAllArrayTypes(mean_n_absolute_max, [-1, -5, 4, 10], 6.33333333333, number_of_maxima=3) - self.assertAlmostEqualOnAllArrayTypes(mean_n_absolute_max, [0, -5, -9], 7.000000, number_of_maxima=2) - self.assertAlmostEqualOnAllArrayTypes(mean_n_absolute_max, [0, 0, 0], 0, number_of_maxima=1) + self.assertIsNanOnAllArrayTypes( + mean_n_absolute_max, [12, 3], number_of_maxima=10 + ) + self.assertRaises( + AssertionError, mean_n_absolute_max, [12, 3], number_of_maxima=0 + ) + self.assertRaises( + AssertionError, mean_n_absolute_max, [12, 3], number_of_maxima=-1 + ) + self.assertAlmostEqualOnAllArrayTypes( + mean_n_absolute_max, [-1, -5, 4, 10], 6.33333333333, number_of_maxima=3 + ) + self.assertAlmostEqualOnAllArrayTypes( + mean_n_absolute_max, [0, -5, -9], 7.000000, number_of_maxima=2 + ) + self.assertAlmostEqualOnAllArrayTypes( + mean_n_absolute_max, [0, 0, 0], 0, number_of_maxima=1 + ) def test_absolute_sum_of_changes(self): self.assertEqualOnAllArrayTypes(absolute_sum_of_changes, [1, 1, 1, 1, 2, 1], 2) @@ -492,15 +656,23 @@ def test_absolute_sum_of_changes(self): self.assertEqualOnAllArrayTypes(absolute_sum_of_changes, [], 0) def test_longest_strike_below_mean(self): - self.assertEqualOnAllArrayTypes(longest_strike_below_mean, [1, 2, 1, 1, 1, 2, 2, 2], 3) - self.assertEqualOnAllArrayTypes(longest_strike_below_mean, [1, 2, 3, 4, 5, 6], 3) + self.assertEqualOnAllArrayTypes( + longest_strike_below_mean, [1, 2, 1, 1, 1, 2, 2, 2], 3 + ) + self.assertEqualOnAllArrayTypes( + longest_strike_below_mean, [1, 2, 3, 4, 5, 6], 3 + ) self.assertEqualOnAllArrayTypes(longest_strike_below_mean, [1, 2, 3, 4, 5], 2) self.assertEqualOnAllArrayTypes(longest_strike_below_mean, [1, 2, 1], 1) self.assertEqualOnAllArrayTypes(longest_strike_below_mean, [], 0) def test_longest_strike_above_mean(self): - 
self.assertEqualOnAllArrayTypes(longest_strike_above_mean, [1, 2, 1, 2, 1, 2, 2, 1], 2) - self.assertEqualOnAllArrayTypes(longest_strike_above_mean, [1, 2, 3, 4, 5, 6], 3) + self.assertEqualOnAllArrayTypes( + longest_strike_above_mean, [1, 2, 1, 2, 1, 2, 2, 1], 2 + ) + self.assertEqualOnAllArrayTypes( + longest_strike_above_mean, [1, 2, 3, 4, 5, 6], 3 + ) self.assertEqualOnAllArrayTypes(longest_strike_above_mean, [1, 2, 3, 4, 5], 2) self.assertEqualOnAllArrayTypes(longest_strike_above_mean, [1, 2, 1], 1) self.assertEqualOnAllArrayTypes(longest_strike_above_mean, [], 0) @@ -518,86 +690,178 @@ def test_count_below_mean(self): self.assertEqualOnAllArrayTypes(count_below_mean, [], 0) def test_last_location_maximum(self): - self.assertAlmostEqualOnAllArrayTypes(last_location_of_maximum, [1, 2, 1, 2, 1], 0.8) - self.assertAlmostEqualOnAllArrayTypes(last_location_of_maximum, [1, 2, 1, 1, 2], 1.0) - self.assertAlmostEqualOnAllArrayTypes(last_location_of_maximum, [2, 1, 1, 1, 1], 0.2) - self.assertAlmostEqualOnAllArrayTypes(last_location_of_maximum, [1, 1, 1, 1, 1], 1.0) + self.assertAlmostEqualOnAllArrayTypes( + last_location_of_maximum, [1, 2, 1, 2, 1], 0.8 + ) + self.assertAlmostEqualOnAllArrayTypes( + last_location_of_maximum, [1, 2, 1, 1, 2], 1.0 + ) + self.assertAlmostEqualOnAllArrayTypes( + last_location_of_maximum, [2, 1, 1, 1, 1], 0.2 + ) + self.assertAlmostEqualOnAllArrayTypes( + last_location_of_maximum, [1, 1, 1, 1, 1], 1.0 + ) self.assertAlmostEqualOnAllArrayTypes(last_location_of_maximum, [1], 1.0) self.assertIsNanOnAllArrayTypes(last_location_of_maximum, []) def test_first_location_of_maximum(self): - self.assertAlmostEqualOnAllArrayTypes(first_location_of_maximum, [1, 2, 1, 2, 1], 0.2) - self.assertAlmostEqualOnAllArrayTypes(first_location_of_maximum, [1, 2, 1, 1, 2], 0.2) - self.assertAlmostEqualOnAllArrayTypes(first_location_of_maximum, [2, 1, 1, 1, 1], 0.0) - self.assertAlmostEqualOnAllArrayTypes(first_location_of_maximum, [1, 1, 1, 1, 1], 0.0) + self.assertAlmostEqualOnAllArrayTypes( + first_location_of_maximum, [1, 2, 1, 2, 1], 0.2 + ) + self.assertAlmostEqualOnAllArrayTypes( + first_location_of_maximum, [1, 2, 1, 1, 2], 0.2 + ) + self.assertAlmostEqualOnAllArrayTypes( + first_location_of_maximum, [2, 1, 1, 1, 1], 0.0 + ) + self.assertAlmostEqualOnAllArrayTypes( + first_location_of_maximum, [1, 1, 1, 1, 1], 0.0 + ) self.assertAlmostEqualOnAllArrayTypes(first_location_of_maximum, [1], 0.0) self.assertIsNanOnAllArrayTypes(first_location_of_maximum, []) def test_last_location_of_minimum(self): - self.assertAlmostEqualOnAllArrayTypes(last_location_of_minimum, [1, 2, 1, 2, 1], 1.0) - self.assertAlmostEqualOnAllArrayTypes(last_location_of_minimum, [1, 2, 1, 2, 2], 0.6) - self.assertAlmostEqualOnAllArrayTypes(last_location_of_minimum, [2, 1, 1, 1, 2], 0.8) - self.assertAlmostEqualOnAllArrayTypes(last_location_of_minimum, [1, 1, 1, 1, 1], 1.0) + self.assertAlmostEqualOnAllArrayTypes( + last_location_of_minimum, [1, 2, 1, 2, 1], 1.0 + ) + self.assertAlmostEqualOnAllArrayTypes( + last_location_of_minimum, [1, 2, 1, 2, 2], 0.6 + ) + self.assertAlmostEqualOnAllArrayTypes( + last_location_of_minimum, [2, 1, 1, 1, 2], 0.8 + ) + self.assertAlmostEqualOnAllArrayTypes( + last_location_of_minimum, [1, 1, 1, 1, 1], 1.0 + ) self.assertAlmostEqualOnAllArrayTypes(last_location_of_minimum, [1], 1.0) self.assertIsNanOnAllArrayTypes(last_location_of_minimum, []) def test_first_location_of_minimum(self): - self.assertAlmostEqualOnAllArrayTypes(first_location_of_minimum, [1, 2, 1, 2, 1], 0.0) - 
self.assertAlmostEqualOnAllArrayTypes(first_location_of_minimum, [2, 2, 1, 2, 2], 0.4) - self.assertAlmostEqualOnAllArrayTypes(first_location_of_minimum, [2, 1, 1, 1, 2], 0.2) - self.assertAlmostEqualOnAllArrayTypes(first_location_of_minimum, [1, 1, 1, 1, 1], 0.0) + self.assertAlmostEqualOnAllArrayTypes( + first_location_of_minimum, [1, 2, 1, 2, 1], 0.0 + ) + self.assertAlmostEqualOnAllArrayTypes( + first_location_of_minimum, [2, 2, 1, 2, 2], 0.4 + ) + self.assertAlmostEqualOnAllArrayTypes( + first_location_of_minimum, [2, 1, 1, 1, 2], 0.2 + ) + self.assertAlmostEqualOnAllArrayTypes( + first_location_of_minimum, [1, 1, 1, 1, 1], 0.0 + ) self.assertAlmostEqualOnAllArrayTypes(first_location_of_minimum, [1], 0.0) self.assertIsNanOnAllArrayTypes(first_location_of_minimum, []) def test_percentage_of_doubled_datapoints(self): - self.assertAlmostEqualOnAllArrayTypes(percentage_of_reoccurring_datapoints_to_all_datapoints, [1, 1, 2, 3, 4], - 0.4) - self.assertAlmostEqualOnAllArrayTypes(percentage_of_reoccurring_datapoints_to_all_datapoints, [1, 1.5, 2, 3], 0) - self.assertAlmostEqualOnAllArrayTypes(percentage_of_reoccurring_datapoints_to_all_datapoints, [1], 0) - self.assertAlmostEqualOnAllArrayTypes(percentage_of_reoccurring_datapoints_to_all_datapoints, - [1.111, -2.45, 1.111, 2.45], 0.5) - self.assertIsNanOnAllArrayTypes(percentage_of_reoccurring_datapoints_to_all_datapoints, []) + self.assertAlmostEqualOnAllArrayTypes( + percentage_of_reoccurring_datapoints_to_all_datapoints, [1, 1, 2, 3, 4], 0.4 + ) + self.assertAlmostEqualOnAllArrayTypes( + percentage_of_reoccurring_datapoints_to_all_datapoints, [1, 1.5, 2, 3], 0 + ) + self.assertAlmostEqualOnAllArrayTypes( + percentage_of_reoccurring_datapoints_to_all_datapoints, [1], 0 + ) + self.assertAlmostEqualOnAllArrayTypes( + percentage_of_reoccurring_datapoints_to_all_datapoints, + [1.111, -2.45, 1.111, 2.45], + 0.5, + ) + self.assertIsNanOnAllArrayTypes( + percentage_of_reoccurring_datapoints_to_all_datapoints, [] + ) def test_ratio_of_doubled_values(self): - self.assertAlmostEqualOnAllArrayTypes(percentage_of_reoccurring_values_to_all_values, [1, 1, 2, 3, 4], 0.25) - self.assertAlmostEqualOnAllArrayTypes(percentage_of_reoccurring_values_to_all_values, [1, 1.5, 2, 3], 0) - self.assertAlmostEqualOnAllArrayTypes(percentage_of_reoccurring_values_to_all_values, [1], 0) - self.assertAlmostEqualOnAllArrayTypes(percentage_of_reoccurring_values_to_all_values, - [1.111, -2.45, 1.111, 2.45], 1.0 / 3.0) - self.assertIsNanOnAllArrayTypes(percentage_of_reoccurring_values_to_all_values, []) + self.assertAlmostEqualOnAllArrayTypes( + percentage_of_reoccurring_values_to_all_values, [1, 1, 2, 3, 4], 0.25 + ) + self.assertAlmostEqualOnAllArrayTypes( + percentage_of_reoccurring_values_to_all_values, [1, 1.5, 2, 3], 0 + ) + self.assertAlmostEqualOnAllArrayTypes( + percentage_of_reoccurring_values_to_all_values, [1], 0 + ) + self.assertAlmostEqualOnAllArrayTypes( + percentage_of_reoccurring_values_to_all_values, + [1.111, -2.45, 1.111, 2.45], + 1.0 / 3.0, + ) + self.assertIsNanOnAllArrayTypes( + percentage_of_reoccurring_values_to_all_values, [] + ) def test_sum_of_reoccurring_values(self): - self.assertAlmostEqualOnAllArrayTypes(sum_of_reoccurring_values, [1, 1, 2, 3, 4, 4], 5) - self.assertAlmostEqualOnAllArrayTypes(sum_of_reoccurring_values, [1, 1.5, 2, 3], 0) + self.assertAlmostEqualOnAllArrayTypes( + sum_of_reoccurring_values, [1, 1, 2, 3, 4, 4], 5 + ) + self.assertAlmostEqualOnAllArrayTypes( + sum_of_reoccurring_values, [1, 1.5, 2, 3], 0 + ) 
self.assertAlmostEqualOnAllArrayTypes(sum_of_reoccurring_values, [1], 0) - self.assertAlmostEqualOnAllArrayTypes(sum_of_reoccurring_values, [1.111, -2.45, 1.111, 2.45], 1.111) + self.assertAlmostEqualOnAllArrayTypes( + sum_of_reoccurring_values, [1.111, -2.45, 1.111, 2.45], 1.111 + ) self.assertAlmostEqualOnAllArrayTypes(sum_of_reoccurring_values, [], 0) def test_sum_of_reoccurring_data_points(self): - self.assertAlmostEqualOnAllArrayTypes(sum_of_reoccurring_data_points, [1, 1, 2, 3, 4, 4], 10) - self.assertAlmostEqualOnAllArrayTypes(sum_of_reoccurring_data_points, [1, 1.5, 2, 3], 0) + self.assertAlmostEqualOnAllArrayTypes( + sum_of_reoccurring_data_points, [1, 1, 2, 3, 4, 4], 10 + ) + self.assertAlmostEqualOnAllArrayTypes( + sum_of_reoccurring_data_points, [1, 1.5, 2, 3], 0 + ) self.assertAlmostEqualOnAllArrayTypes(sum_of_reoccurring_data_points, [1], 0) - self.assertAlmostEqualOnAllArrayTypes(sum_of_reoccurring_data_points, [1.111, -2.45, 1.111, 2.45], 2.222) + self.assertAlmostEqualOnAllArrayTypes( + sum_of_reoccurring_data_points, [1.111, -2.45, 1.111, 2.45], 2.222 + ) self.assertAlmostEqualOnAllArrayTypes(sum_of_reoccurring_data_points, [], 0) def test_uniqueness_factor(self): - self.assertAlmostEqualOnAllArrayTypes(ratio_value_number_to_time_series_length, [1, 1, 2, 3, 4], 0.8) - self.assertAlmostEqualOnAllArrayTypes(ratio_value_number_to_time_series_length, [1, 1.5, 2, 3], 1) - self.assertAlmostEqualOnAllArrayTypes(ratio_value_number_to_time_series_length, [1], 1) - self.assertAlmostEqualOnAllArrayTypes(ratio_value_number_to_time_series_length, [1.111, -2.45, 1.111, 2.45], - 0.75) + self.assertAlmostEqualOnAllArrayTypes( + ratio_value_number_to_time_series_length, [1, 1, 2, 3, 4], 0.8 + ) + self.assertAlmostEqualOnAllArrayTypes( + ratio_value_number_to_time_series_length, [1, 1.5, 2, 3], 1 + ) + self.assertAlmostEqualOnAllArrayTypes( + ratio_value_number_to_time_series_length, [1], 1 + ) + self.assertAlmostEqualOnAllArrayTypes( + ratio_value_number_to_time_series_length, [1.111, -2.45, 1.111, 2.45], 0.75 + ) self.assertIsNanOnAllArrayTypes(ratio_value_number_to_time_series_length, []) def test_fft_coefficient(self): x = range(10) - param = [{"coeff": 0, "attr": "real"}, {"coeff": 1, "attr": "real"}, {"coeff": 2, "attr": "real"}, - {"coeff": 0, "attr": "imag"}, {"coeff": 1, "attr": "imag"}, {"coeff": 2, "attr": "imag"}, - {"coeff": 0, "attr": "angle"}, {"coeff": 1, "attr": "angle"}, {"coeff": 2, "attr": "angle"}, - {"coeff": 0, "attr": "abs"}, {"coeff": 1, "attr": "abs"}, {"coeff": 2, "attr": "abs"}] - expected_index = ['attr_"real"__coeff_0', 'attr_"real"__coeff_1', 'attr_"real"__coeff_2', - 'attr_"imag"__coeff_0', 'attr_"imag"__coeff_1', 'attr_"imag"__coeff_2', - 'attr_"angle"__coeff_0', 'attr_"angle"__coeff_1', 'attr_"angle"__coeff_2', - 'attr_"abs"__coeff_0', 'attr_"abs"__coeff_1', 'attr_"abs"__coeff_2'] + param = [ + {"coeff": 0, "attr": "real"}, + {"coeff": 1, "attr": "real"}, + {"coeff": 2, "attr": "real"}, + {"coeff": 0, "attr": "imag"}, + {"coeff": 1, "attr": "imag"}, + {"coeff": 2, "attr": "imag"}, + {"coeff": 0, "attr": "angle"}, + {"coeff": 1, "attr": "angle"}, + {"coeff": 2, "attr": "angle"}, + {"coeff": 0, "attr": "abs"}, + {"coeff": 1, "attr": "abs"}, + {"coeff": 2, "attr": "abs"}, + ] + expected_index = [ + 'attr_"real"__coeff_0', + 'attr_"real"__coeff_1', + 'attr_"real"__coeff_2', + 'attr_"imag"__coeff_0', + 'attr_"imag"__coeff_1', + 'attr_"imag"__coeff_2', + 'attr_"angle"__coeff_0', + 'attr_"angle"__coeff_1', + 'attr_"angle"__coeff_2', + 'attr_"abs"__coeff_0', + 
'attr_"abs"__coeff_1', + 'attr_"abs"__coeff_2', + ] res = pd.Series(dict(fft_coefficient(x, param))) self.assertCountEqual(list(res.index), expected_index) @@ -632,9 +896,14 @@ def test_fft_aggregated(self): {"aggtype": "centroid"}, {"aggtype": "variance"}, {"aggtype": "skew"}, - {"aggtype": "kurtosis"} + {"aggtype": "kurtosis"}, + ] + expected_index = [ + 'aggtype_"centroid"', + 'aggtype_"variance"', + 'aggtype_"skew"', + 'aggtype_"kurtosis"', ] - expected_index = ['aggtype_"centroid"', 'aggtype_"variance"', 'aggtype_"skew"', 'aggtype_"kurtosis"'] x = np.arange(10) res = pd.Series(dict(fft_aggregated(x, param))) @@ -659,17 +928,21 @@ def test_fft_aggregated(self): x = np.sin(2 * np.pi / 10 * np.arange(30)) res = pd.Series(dict(fft_aggregated(x, param))) self.assertCountEqual(list(res.index), expected_index) - self.assertAlmostEqual(res['aggtype_"centroid"'], 3., places=5) - self.assertAlmostEqual(res['aggtype_"variance"'], 0., places=5) + self.assertAlmostEqual(res['aggtype_"centroid"'], 3.0, places=5) + self.assertAlmostEqual(res['aggtype_"variance"'], 0.0, places=5) self.assertIsNaN(res['aggtype_"skew"']) self.assertIsNaN(res['aggtype_"kurtosis"']) # Gaussian test: def normal(y, mean_, sigma_): - return 1 / (2 * np.pi * sigma_ ** 2) * np.exp(-(y - mean_) ** 2 / (2 * sigma_ ** 2)) - - mean_ = 500. - sigma_ = 1. + return ( + 1 + / (2 * np.pi * sigma_ ** 2) + * np.exp(-((y - mean_) ** 2) / (2 * sigma_ ** 2)) + ) + + mean_ = 500.0 + sigma_ = 1.0 range_ = int(2 * mean_) x = list(map(lambda x: normal(x, mean_, sigma_), range(range_))) @@ -686,12 +959,14 @@ def normal(y, mean_, sigma_): # Compare against hand calculated values: rel_diff_allowed = 0.02 self.assertAlmostEqual( - res['aggtype_"centroid"'], expected_fft_centroid, - delta=rel_diff_allowed * expected_fft_centroid + res['aggtype_"centroid"'], + expected_fft_centroid, + delta=rel_diff_allowed * expected_fft_centroid, ) self.assertAlmostEqual( - res['aggtype_"variance"'], expected_fft_var, - delta=rel_diff_allowed * expected_fft_var + res['aggtype_"variance"'], + expected_fft_var, + delta=rel_diff_allowed * expected_fft_var, ) def test_number_peaks(self): @@ -736,8 +1011,7 @@ def test_mass_quantile(self): x = [0, 1, 1, 0, 0, 1, 0, 0] param = [{"q": 0.30}, {"q": 0.60}, {"q": 0.90}] - expected_index = ["q_0.3", "q_0.6", - "q_0.9"] + expected_index = ["q_0.3", "q_0.6", "q_0.9"] res = index_mass_quantile(x, param) res = pd.Series(dict(res)) @@ -781,14 +1055,18 @@ def test_spkt_welch_density(self): def test_cwt_coefficients(self): x = [0.1, 0.2, 0.3] - param = [{"widths": (1, 2, 3), "coeff": 2, "w": 1}, - {"widths": (1, 3), "coeff": 2, "w": 3}, - {"widths": (1, 3), "coeff": 5, "w": 3}] + param = [ + {"widths": (1, 2, 3), "coeff": 2, "w": 1}, + {"widths": (1, 3), "coeff": 2, "w": 3}, + {"widths": (1, 3), "coeff": 5, "w": 3}, + ] shuffle(param) - expected_index = ["coeff_2__w_1__widths_(1, 2, 3)", - "coeff_2__w_3__widths_(1, 3)", - "coeff_5__w_3__widths_(1, 3)"] + expected_index = [ + "coeff_2__w_1__widths_(1, 2, 3)", + "coeff_2__w_3__widths_(1, 3)", + "coeff_5__w_3__widths_(1, 3)", + ] res = cwt_coefficients(x, param) res = pd.Series(dict(res)) @@ -816,8 +1094,14 @@ def test_ar_coefficient(self): self.assertAlmostEqual(res["coeff_1__k_1"], 2.5, places=2) # Test for X_i = 1.4 * X_{i-1} - 1 X_{i-2} + 1 - param = [{"k": 1, "coeff": 0}, {"k": 1, "coeff": 1}, - {"k": 2, "coeff": 0}, {"k": 2, "coeff": 1}, {"k": 2, "coeff": 2}, {"k": 2, "coeff": 3}] + param = [ + {"k": 1, "coeff": 0}, + {"k": 1, "coeff": 1}, + {"k": 2, "coeff": 0}, + {"k": 2, 
"coeff": 1}, + {"k": 2, "coeff": 2}, + {"k": 2, "coeff": 3}, + ] shuffle(param) x = [1, 1] + 5 * [0] @@ -825,9 +1109,14 @@ def test_ar_coefficient(self): x[i] = (-2) * x[i - 2] + 3.5 * x[i - 1] + 1 res = ar_coefficient(x, param) - expected_index = ["coeff_0__k_1", "coeff_1__k_1", - "coeff_0__k_2", "coeff_1__k_2", - "coeff_2__k_2", "coeff_3__k_2"] + expected_index = [ + "coeff_0__k_1", + "coeff_1__k_1", + "coeff_0__k_2", + "coeff_1__k_2", + "coeff_2__k_2", + "coeff_3__k_2", + ] res = pd.Series(dict(res)) @@ -840,16 +1129,30 @@ def test_ar_coefficient(self): def test_time_reversal_asymmetry_statistic(self): x = [1] * 10 - self.assertAlmostEqualOnAllArrayTypes(time_reversal_asymmetry_statistic, x, 0, 0) - self.assertAlmostEqualOnAllArrayTypes(time_reversal_asymmetry_statistic, x, 0, 1) - self.assertAlmostEqualOnAllArrayTypes(time_reversal_asymmetry_statistic, x, 0, 2) - self.assertAlmostEqualOnAllArrayTypes(time_reversal_asymmetry_statistic, x, 0, 3) + self.assertAlmostEqualOnAllArrayTypes( + time_reversal_asymmetry_statistic, x, 0, 0 + ) + self.assertAlmostEqualOnAllArrayTypes( + time_reversal_asymmetry_statistic, x, 0, 1 + ) + self.assertAlmostEqualOnAllArrayTypes( + time_reversal_asymmetry_statistic, x, 0, 2 + ) + self.assertAlmostEqualOnAllArrayTypes( + time_reversal_asymmetry_statistic, x, 0, 3 + ) x = [1, 2, -3, 4] # 1/2 * ( (4^2 * -3 + 3 * 2^2) + (3^2*2)-(2*1^1)) = 1/2 * (-48+12+18-2) = 20/2 - self.assertAlmostEqualOnAllArrayTypes(time_reversal_asymmetry_statistic, x, -10, 1) - self.assertAlmostEqualOnAllArrayTypes(time_reversal_asymmetry_statistic, x, 0, 2) - self.assertAlmostEqualOnAllArrayTypes(time_reversal_asymmetry_statistic, x, 0, 3) + self.assertAlmostEqualOnAllArrayTypes( + time_reversal_asymmetry_statistic, x, -10, 1 + ) + self.assertAlmostEqualOnAllArrayTypes( + time_reversal_asymmetry_statistic, x, 0, 2 + ) + self.assertAlmostEqualOnAllArrayTypes( + time_reversal_asymmetry_statistic, x, 0, 3 + ) def test_number_crossing_m(self): x = [10, -10, 10, -10] @@ -875,20 +1178,135 @@ def test_c3(self): def test_binned_entropy(self): self.assertAlmostEqualOnAllArrayTypes(binned_entropy, [10] * 100, 0, 10) - self.assertAlmostEqualOnAllArrayTypes(binned_entropy, [10] * 10 + [1], - (10 / 11 * np.math.log(10 / 11) + - 1 / 11 * np.math.log(1 / 11)), 10) - self.assertAlmostEqualOnAllArrayTypes(binned_entropy, [10] * 10 + [1], - (10 / 11 * np.math.log(10 / 11) + - 1 / 11 * np.math.log(1 / 11)), 10) - self.assertAlmostEqualOnAllArrayTypes(binned_entropy, [10] * 10 + [1], - (10 / 11 * np.math.log(10 / 11) + - 1 / 11 * np.math.log(1 / 11)), 100) - self.assertAlmostEqualOnAllArrayTypes(binned_entropy, list(range(10)), - np.math.log(1 / 10), 100) - self.assertAlmostEqualOnAllArrayTypes(binned_entropy, list(range(100)), - np.math.log(1 / 2), 2) + self.assertAlmostEqualOnAllArrayTypes( + binned_entropy, + [10] * 10 + [1], + -(10 / 11 * np.math.log(10 / 11) + 1 / 11 * np.math.log(1 / 11)), + 10, + ) + self.assertAlmostEqualOnAllArrayTypes( + binned_entropy, + [10] * 10 + [1], + -(10 / 11 * np.math.log(10 / 11) + 1 / 11 * np.math.log(1 / 11)), + 10, + ) + self.assertAlmostEqualOnAllArrayTypes( + binned_entropy, + [10] * 10 + [1], + -(10 / 11 * np.math.log(10 / 11) + 1 / 11 * np.math.log(1 / 11)), + 100, + ) + self.assertAlmostEqualOnAllArrayTypes( + binned_entropy, list(range(10)), -np.math.log(1 / 10), 100 + ) + self.assertAlmostEqualOnAllArrayTypes( + binned_entropy, list(range(100)), -np.math.log(1 / 2), 2 + ) def test_sample_entropy(self): # "random" list -> large entropy - ts = [1, 4, 5, 1, 7, 
3, 1, 2, 5, 8, 9, 7, 3, 7, 9, 5, 4, 3, 9, 1, 2, 3, 4, 2, 9, 6, 7, 4, 9, 2, 9, 9, 6, 5, 1, - 3, 8, 1, 5, 3, 8, 4, 1, 2, 2, 1, 6, 5, 3, 6, 5, 4, 8, 9, 6, 7, 5, 3, 2, 5, 4, 2, 5, 1, 6, 5, 3, 5, 6, 7, - 8, 5, 2, 8, 6, 3, 8, 2, 7, 1, 7, 3, 5, 6, 2, 1, 3, 7, 3, 5, 3, 7, 6, 7, 7, 2, 3, 1, 7, 8] + ts = [ + 1, + 4, + 5, + 1, + 7, + 3, + 1, + 2, + 5, + 8, + 9, + 7, + 3, + 7, + 9, + 5, + 4, + 3, + 9, + 1, + 2, + 3, + 4, + 2, + 9, + 6, + 7, + 4, + 9, + 2, + 9, + 9, + 6, + 5, + 1, + 3, + 8, + 1, + 5, + 3, + 8, + 4, + 1, + 2, + 2, + 1, + 6, + 5, + 3, + 6, + 5, + 4, + 8, + 9, + 6, + 7, + 5, + 3, + 2, + 5, + 4, + 2, + 5, + 1, + 6, + 5, + 3, + 5, + 6, + 7, + 8, + 5, + 2, + 8, + 6, + 3, + 8, + 2, + 7, + 1, + 7, + 3, + 5, + 6, + 2, + 1, + 3, + 7, + 3, + 5, + 3, + 7, + 6, + 7, + 7, + 2, + 3, + 1, + 7, + 8, + ] self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 2.38262780) # This is not very complex, so it gives a small value ts = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] @@ -910,11 +1328,17 @@ def test_sample_entropy(self): self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.0010314596066622707) def test_autocorrelation(self): - self.assertAlmostEqualOnAllArrayTypes(autocorrelation, [1, 2, 1, 2, 1, 2], -1, 1) + self.assertAlmostEqualOnAllArrayTypes( + autocorrelation, [1, 2, 1, 2, 1, 2], -1, 1 + ) self.assertAlmostEqualOnAllArrayTypes(autocorrelation, [1, 2, 1, 2, 1, 2], 1, 2) - self.assertAlmostEqualOnAllArrayTypes(autocorrelation, [1, 2, 1, 2, 1, 2], -1, 3) + self.assertAlmostEqualOnAllArrayTypes( + autocorrelation, [1, 2, 1, 2, 1, 2], -1, 3 + ) self.assertAlmostEqualOnAllArrayTypes(autocorrelation, [1, 2, 1, 2, 1, 2], 1, 4) - self.assertAlmostEqualOnAllArrayTypes(autocorrelation, pd.Series([0, 1, 2, 0, 1, 2]), -0.75, 2) + self.assertAlmostEqualOnAllArrayTypes( + autocorrelation, pd.Series([0, 1, 2, 0, 1, 2]), -0.75, 2 + ) # Autocorrelation lag is larger than length of the time series self.assertIsNanOnAllArrayTypes(autocorrelation, [1, 2, 1, 2, 1, 2], 200) self.assertIsNanOnAllArrayTypes(autocorrelation, [np.nan], 0) @@ -923,87 +1347,220 @@ def test_autocorrelation(self): self.assertIsNanOnAllArrayTypes(autocorrelation, [1], 0) def test_quantile(self): - self.assertAlmostEqualOnAllArrayTypes(quantile, [1, 1, 1, 3, 4, 7, 9, 11, 13, 13], 1.0, 0.2) - self.assertAlmostEqualOnAllArrayTypes(quantile, [1, 1, 1, 3, 4, 7, 9, 11, 13, 13], 13, 0.9) - self.assertAlmostEqualOnAllArrayTypes(quantile, [1, 1, 1, 3, 4, 7, 9, 11, 13, 13], 13, 1.0) + self.assertAlmostEqualOnAllArrayTypes( + quantile, [1, 1, 1, 3, 4, 7, 9, 11, 13, 13], 1.0, 0.2 + ) + self.assertAlmostEqualOnAllArrayTypes( + quantile, [1, 1, 1, 3, 4, 7, 9, 11, 13, 13], 13, 0.9 + ) + self.assertAlmostEqualOnAllArrayTypes( + quantile, [1, 1, 1, 3, 4, 7, 9, 11, 13, 13], 13, 1.0 + ) self.assertAlmostEqualOnAllArrayTypes(quantile, [1], 1, 0.5) self.assertIsNanOnAllArrayTypes(quantile, [], 0.5) def test_mean_abs_change_quantiles(self): - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, list(range(10)), 1, - ql=0.1, qh=0.9, isabs=True, f_agg="mean") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, list(range(10)), 0, - ql=0.15, qh=0.18, isabs=True, f_agg="mean") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, 0, 0, 0], 0.5, - ql=0, qh=1, isabs=True, f_agg="mean") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, 0, 0, 0], 0.5, - ql=0.1, qh=1, isabs=True, f_agg="mean") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, 0, 0, 0], 0, - ql=0.1, qh=0.6, isabs=True, f_agg="mean") - 
self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, -9, 0, 0], 5, - ql=0, qh=1, isabs=True, f_agg="mean") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, -9, 0, 0], 0.5, - ql=0.1, qh=1, isabs=True, f_agg="mean") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, -9, 0, 0, 1, 0], 0.75, - ql=0.1, qh=1, isabs=True, f_agg="mean") - - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, list(range(10)), 1, - ql=0.1, qh=0.9, isabs=False, f_agg="mean") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, list(range(10)), 0, - ql=0.15, qh=0.18, isabs=False, f_agg="mean") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, 0, 0, 0], 0, - ql=0, qh=1, isabs=False, f_agg="mean") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, 0, 0, 0], 0, - ql=0.1, qh=1, isabs=False, f_agg="mean") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, 0, 0, 0], 0, - ql=0.1, qh=0.6, isabs=False, f_agg="mean") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, -9, 0, 0], 0, - ql=0, qh=1, isabs=False, f_agg="mean") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, -9, 0, 0], 0.5, - ql=0.1, qh=1, isabs=False, f_agg="mean") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, -9, 0, 0, 1, 0], 0.25, - ql=0.1, qh=1, isabs=False, f_agg="mean") - - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, list(range(10)), 0, - ql=0.1, qh=0.9, isabs=True, f_agg="std") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, 0, 0, 0], 0.5, - ql=0, qh=1, isabs=True, f_agg="std") - - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, list(range(10)), 0, - ql=0.1, qh=0.9, isabs=False, f_agg="std") - self.assertAlmostEqualOnAllArrayTypes(change_quantiles, [0, 1, 0, 1, 0], 1, - ql=0, qh=1, isabs=False, f_agg="std") + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + list(range(10)), + 1, + ql=0.1, + qh=0.9, + isabs=True, + f_agg="mean", + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + list(range(10)), + 0, + ql=0.15, + qh=0.18, + isabs=True, + f_agg="mean", + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, [0, 1, 0, 0, 0], 0.5, ql=0, qh=1, isabs=True, f_agg="mean" + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + [0, 1, 0, 0, 0], + 0.5, + ql=0.1, + qh=1, + isabs=True, + f_agg="mean", + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + [0, 1, 0, 0, 0], + 0, + ql=0.1, + qh=0.6, + isabs=True, + f_agg="mean", + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, [0, 1, -9, 0, 0], 5, ql=0, qh=1, isabs=True, f_agg="mean" + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + [0, 1, -9, 0, 0], + 0.5, + ql=0.1, + qh=1, + isabs=True, + f_agg="mean", + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + [0, 1, -9, 0, 0, 1, 0], + 0.75, + ql=0.1, + qh=1, + isabs=True, + f_agg="mean", + ) + + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + list(range(10)), + 1, + ql=0.1, + qh=0.9, + isabs=False, + f_agg="mean", + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + list(range(10)), + 0, + ql=0.15, + qh=0.18, + isabs=False, + f_agg="mean", + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, [0, 1, 0, 0, 0], 0, ql=0, qh=1, isabs=False, f_agg="mean" + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + [0, 1, 0, 0, 0], + 0, + ql=0.1, + qh=1, + isabs=False, + f_agg="mean", + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + [0, 1, 0, 0, 
0], + 0, + ql=0.1, + qh=0.6, + isabs=False, + f_agg="mean", + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, [0, 1, -9, 0, 0], 0, ql=0, qh=1, isabs=False, f_agg="mean" + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + [0, 1, -9, 0, 0], + 0.5, + ql=0.1, + qh=1, + isabs=False, + f_agg="mean", + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + [0, 1, -9, 0, 0, 1, 0], + 0.25, + ql=0.1, + qh=1, + isabs=False, + f_agg="mean", + ) + + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + list(range(10)), + 0, + ql=0.1, + qh=0.9, + isabs=True, + f_agg="std", + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, [0, 1, 0, 0, 0], 0.5, ql=0, qh=1, isabs=True, f_agg="std" + ) + + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, + list(range(10)), + 0, + ql=0.1, + qh=0.9, + isabs=False, + f_agg="std", + ) + self.assertAlmostEqualOnAllArrayTypes( + change_quantiles, [0, 1, 0, 1, 0], 1, ql=0, qh=1, isabs=False, f_agg="std" + ) def test_value_count(self): self.assertEqualPandasSeriesWrapper(value_count, [1] * 10, 10, value=1) self.assertEqualPandasSeriesWrapper(value_count, list(range(10)), 1, value=0) self.assertEqualPandasSeriesWrapper(value_count, [1] * 10, 0, value=0) self.assertEqualPandasSeriesWrapper(value_count, [np.NaN, 0, 1] * 3, 3, value=0) - self.assertEqualPandasSeriesWrapper(value_count, [np.NINF, 0, 1] * 3, 3, value=0) - self.assertEqualPandasSeriesWrapper(value_count, [np.PINF, 0, 1] * 3, 3, value=0) - self.assertEqualPandasSeriesWrapper(value_count, [0.1, 0.2, 0.3] * 3, 3, value=0.2) - self.assertEqualPandasSeriesWrapper(value_count, [np.NaN, 0, 1] * 3, 3, value=np.NaN) - self.assertEqualPandasSeriesWrapper(value_count, [np.NINF, 0, 1] * 3, 3, value=np.NINF) - self.assertEqualPandasSeriesWrapper(value_count, [np.PINF, 0, 1] * 3, 3, value=np.PINF) + self.assertEqualPandasSeriesWrapper( + value_count, [np.NINF, 0, 1] * 3, 3, value=0 + ) + self.assertEqualPandasSeriesWrapper( + value_count, [np.PINF, 0, 1] * 3, 3, value=0 + ) + self.assertEqualPandasSeriesWrapper( + value_count, [0.1, 0.2, 0.3] * 3, 3, value=0.2 + ) + self.assertEqualPandasSeriesWrapper( + value_count, [np.NaN, 0, 1] * 3, 3, value=np.NaN + ) + self.assertEqualPandasSeriesWrapper( + value_count, [np.NINF, 0, 1] * 3, 3, value=np.NINF + ) + self.assertEqualPandasSeriesWrapper( + value_count, [np.PINF, 0, 1] * 3, 3, value=np.PINF + ) def test_range_count(self): self.assertEqualPandasSeriesWrapper(range_count, [1] * 10, 0, min=1, max=1) self.assertEqualPandasSeriesWrapper(range_count, [1] * 10, 0, min=0.9, max=1) self.assertEqualPandasSeriesWrapper(range_count, [1] * 10, 10, min=1, max=1.1) - self.assertEqualPandasSeriesWrapper(range_count, list(range(10)), 9, min=0, max=9) - self.assertEqualPandasSeriesWrapper(range_count, list(range(10)), 10, min=0, max=10) - self.assertEqualPandasSeriesWrapper(range_count, list(range(0, -10, -1)), 9, min=-10, max=0) - self.assertEqualPandasSeriesWrapper(range_count, [np.NaN, np.PINF, np.NINF] + list(range(10)), 10, min=0, - max=10) + self.assertEqualPandasSeriesWrapper( + range_count, list(range(10)), 9, min=0, max=9 + ) + self.assertEqualPandasSeriesWrapper( + range_count, list(range(10)), 10, min=0, max=10 + ) + self.assertEqualPandasSeriesWrapper( + range_count, list(range(0, -10, -1)), 9, min=-10, max=0 + ) + self.assertEqualPandasSeriesWrapper( + range_count, [np.NaN, np.PINF, np.NINF] + list(range(10)), 10, min=0, max=10 + ) def test_approximate_entropy(self): self.assertEqualOnAllArrayTypes(approximate_entropy, 
[1], 0, m=2, r=0.5) self.assertEqualOnAllArrayTypes(approximate_entropy, [1, 2], 0, m=2, r=0.5) self.assertEqualOnAllArrayTypes(approximate_entropy, [1, 2, 3], 0, m=2, r=0.5) self.assertEqualOnAllArrayTypes(approximate_entropy, [1, 2, 3], 0, m=2, r=0.5) - self.assertAlmostEqualOnAllArrayTypes(approximate_entropy, [12, 13, 15, 16, 17] * 10, 0.282456191, m=2, r=0.9) - self.assertRaises(ValueError, approximate_entropy, x=[12, 13, 15, 16, 17] * 10, m=2, r=-0.5) + self.assertAlmostEqualOnAllArrayTypes( + approximate_entropy, [12, 13, 15, 16, 17] * 10, 0.282456191, m=2, r=0.9 + ) + self.assertRaises( + ValueError, approximate_entropy, x=[12, 13, 15, 16, 17] * 10, m=2, r=-0.5 + ) def test_absolute_maximum(self): self.assertEqualOnAllArrayTypes(absolute_maximum, [-5, 0, 1], 5) @@ -1031,21 +1588,31 @@ def test_max_langevin_fixed_point(self): def test_linear_trend(self): # check linear up trend x = range(10) - param = [{"attr": "pvalue"}, {"attr": "rvalue"}, {"attr": "intercept"}, {"attr": "slope"}, {"attr": "stderr"}] + param = [ + {"attr": "pvalue"}, + {"attr": "rvalue"}, + {"attr": "intercept"}, + {"attr": "slope"}, + {"attr": "stderr"}, + ] res = linear_trend(x, param) res = pd.Series(dict(res)) - expected_index = ["attr_\"pvalue\"", "attr_\"intercept\"", - "attr_\"rvalue\"", "attr_\"slope\"", - "attr_\"stderr\""] + expected_index = [ + 'attr_"pvalue"', + 'attr_"intercept"', + 'attr_"rvalue"', + 'attr_"slope"', + 'attr_"stderr"', + ] self.assertEqual(len(res), 5) self.assertCountEqual(list(res.index), expected_index) - self.assertAlmostEqual(res["attr_\"pvalue\""], 0) - self.assertAlmostEqual(res["attr_\"stderr\""], 0) - self.assertAlmostEqual(res["attr_\"intercept\""], 0) - self.assertAlmostEqual(res["attr_\"slope\""], 1.0) + self.assertAlmostEqual(res['attr_"pvalue"'], 0) + self.assertAlmostEqual(res['attr_"stderr"'], 0) + self.assertAlmostEqual(res['attr_"intercept"'], 0) + self.assertAlmostEqual(res['attr_"slope"'], 1.0) # check p value for random trend np.random.seed(42) @@ -1055,7 +1622,7 @@ def test_linear_trend(self): res = pd.Series(dict(res)) - self.assertLess(abs(res["attr_\"rvalue\""]), 0.1) + self.assertLess(abs(res['attr_"rvalue"']), 0.1) # check slope and intercept decreasing trend with intercept x = [42 - 2 * x for x in range(10)] @@ -1064,48 +1631,88 @@ def test_linear_trend(self): res = pd.Series(dict(res)) - self.assertAlmostEqual(res["attr_\"intercept\""], 42) - self.assertAlmostEqual(res["attr_\"slope\""], -2) + self.assertAlmostEqual(res['attr_"intercept"'], 42) + self.assertAlmostEqual(res['attr_"slope"'], -2) def test__aggregate_on_chunks(self): - self.assertListEqual(_aggregate_on_chunks(x=pd.Series([0, 1, 2, 3]), f_agg="max", chunk_len=2), [1, 3]) - self.assertListEqual(_aggregate_on_chunks(x=pd.Series([1, 1, 3, 3]), f_agg="max", chunk_len=2), [1, 3]) - - self.assertListEqual(_aggregate_on_chunks(x=pd.Series([0, 1, 2, 3]), f_agg="min", chunk_len=2), [0, 2]) - self.assertListEqual(_aggregate_on_chunks(x=pd.Series([0, 1, 2, 3, 5]), f_agg="min", chunk_len=2), [0, 2, 5]) - - self.assertListEqual(_aggregate_on_chunks(x=pd.Series([0, 1, 2, 3]), f_agg="mean", chunk_len=2), - [0.5, 2.5]) - self.assertListEqual(_aggregate_on_chunks(x=pd.Series([0, 1, 0, 4, 5]), f_agg="mean", chunk_len=2), - [0.5, 2, 5]) - self.assertListEqual(_aggregate_on_chunks(x=pd.Series([0, 1, 0, 4, 5]), f_agg="mean", chunk_len=3), - [1 / 3, 4.5]) - - self.assertListEqual(_aggregate_on_chunks(x=pd.Series([0, 1, 2, 3, 5, -2]), - f_agg="median", chunk_len=2), [0.5, 2.5, 1.5]) - 
self.assertListEqual(_aggregate_on_chunks(x=pd.Series([-10, 5, 3, -3, 4, -6]), - f_agg="median", chunk_len=3), [3, -3]) - self.assertListEqual(_aggregate_on_chunks(x=pd.Series([0, 1, 2, np.NaN, 5]), - f_agg="median", chunk_len=2), [0.5, 2, 5]) + self.assertListEqual( + _aggregate_on_chunks(x=pd.Series([0, 1, 2, 3]), f_agg="max", chunk_len=2), + [1, 3], + ) + self.assertListEqual( + _aggregate_on_chunks(x=pd.Series([1, 1, 3, 3]), f_agg="max", chunk_len=2), + [1, 3], + ) + + self.assertListEqual( + _aggregate_on_chunks(x=pd.Series([0, 1, 2, 3]), f_agg="min", chunk_len=2), + [0, 2], + ) + self.assertListEqual( + _aggregate_on_chunks( + x=pd.Series([0, 1, 2, 3, 5]), f_agg="min", chunk_len=2 + ), + [0, 2, 5], + ) + + self.assertListEqual( + _aggregate_on_chunks(x=pd.Series([0, 1, 2, 3]), f_agg="mean", chunk_len=2), + [0.5, 2.5], + ) + self.assertListEqual( + _aggregate_on_chunks( + x=pd.Series([0, 1, 0, 4, 5]), f_agg="mean", chunk_len=2 + ), + [0.5, 2, 5], + ) + self.assertListEqual( + _aggregate_on_chunks( + x=pd.Series([0, 1, 0, 4, 5]), f_agg="mean", chunk_len=3 + ), + [1 / 3, 4.5], + ) + + self.assertListEqual( + _aggregate_on_chunks( + x=pd.Series([0, 1, 2, 3, 5, -2]), f_agg="median", chunk_len=2 + ), + [0.5, 2.5, 1.5], + ) + self.assertListEqual( + _aggregate_on_chunks( + x=pd.Series([-10, 5, 3, -3, 4, -6]), f_agg="median", chunk_len=3 + ), + [3, -3], + ) + self.assertListEqual( + _aggregate_on_chunks( + x=pd.Series([0, 1, 2, np.NaN, 5]), f_agg="median", chunk_len=2 + ), + [0.5, 2, 5], + ) def test_agg_linear_trend(self): x = pd.Series(range(9), index=range(9)) - param = [{"attr": "intercept", "chunk_len": 3, "f_agg": "max"}, - {"attr": "slope", "chunk_len": 3, "f_agg": "max"}, - {"attr": "intercept", "chunk_len": 3, "f_agg": "min"}, - {"attr": "slope", "chunk_len": 3, "f_agg": "min"}, - {"attr": "intercept", "chunk_len": 3, "f_agg": "mean"}, - {"attr": "slope", "chunk_len": 3, "f_agg": "mean"}, - {"attr": "intercept", "chunk_len": 3, "f_agg": "median"}, - {"attr": "slope", "chunk_len": 3, "f_agg": "median"}] - expected_index = ['attr_"intercept"__chunk_len_3__f_agg_"max"', - 'attr_"slope"__chunk_len_3__f_agg_"max"', - 'attr_"intercept"__chunk_len_3__f_agg_"min"', - 'attr_"slope"__chunk_len_3__f_agg_"min"', - 'attr_"intercept"__chunk_len_3__f_agg_"mean"', - 'attr_"slope"__chunk_len_3__f_agg_"mean"', - 'attr_"intercept"__chunk_len_3__f_agg_"median"', - 'attr_"slope"__chunk_len_3__f_agg_"median"'] + param = [ + {"attr": "intercept", "chunk_len": 3, "f_agg": "max"}, + {"attr": "slope", "chunk_len": 3, "f_agg": "max"}, + {"attr": "intercept", "chunk_len": 3, "f_agg": "min"}, + {"attr": "slope", "chunk_len": 3, "f_agg": "min"}, + {"attr": "intercept", "chunk_len": 3, "f_agg": "mean"}, + {"attr": "slope", "chunk_len": 3, "f_agg": "mean"}, + {"attr": "intercept", "chunk_len": 3, "f_agg": "median"}, + {"attr": "slope", "chunk_len": 3, "f_agg": "median"}, + ] + expected_index = [ + 'attr_"intercept"__chunk_len_3__f_agg_"max"', + 'attr_"slope"__chunk_len_3__f_agg_"max"', + 'attr_"intercept"__chunk_len_3__f_agg_"min"', + 'attr_"slope"__chunk_len_3__f_agg_"min"', + 'attr_"intercept"__chunk_len_3__f_agg_"mean"', + 'attr_"slope"__chunk_len_3__f_agg_"mean"', + 'attr_"intercept"__chunk_len_3__f_agg_"median"', + 'attr_"slope"__chunk_len_3__f_agg_"median"', + ] res = agg_linear_trend(x=x, param=param) @@ -1193,120 +1800,191 @@ def test_linear_trend_timewise_hours(self): """Test linear_trend_timewise function with hour intervals.""" x = pd.Series( [0, 1, 3, 6], - index=pd.DatetimeIndex([ - '2018-01-01 
04:00:00', '2018-01-01 05:00:00', - '2018-01-01 07:00:00', '2018-01-01 10:00:00' - ]), + index=pd.DatetimeIndex( + [ + "2018-01-01 04:00:00", + "2018-01-01 05:00:00", + "2018-01-01 07:00:00", + "2018-01-01 10:00:00", + ] + ), ) - param = [{"attr": "pvalue"}, {"attr": "rvalue"}, {"attr": "intercept"}, {"attr": "slope"}, {"attr": "stderr"}] + param = [ + {"attr": "pvalue"}, + {"attr": "rvalue"}, + {"attr": "intercept"}, + {"attr": "slope"}, + {"attr": "stderr"}, + ] res = linear_trend_timewise(x, param) res = pd.Series(dict(res)) - expected_index = ["attr_\"pvalue\"", "attr_\"intercept\"", - "attr_\"rvalue\"", "attr_\"slope\"", - "attr_\"stderr\""] + expected_index = [ + 'attr_"pvalue"', + 'attr_"intercept"', + 'attr_"rvalue"', + 'attr_"slope"', + 'attr_"stderr"', + ] self.assertEqual(len(res), 5) self.assertCountEqual(list(res.index), expected_index) - self.assertAlmostEqual(res["attr_\"pvalue\""], 0, places=3) - self.assertAlmostEqual(res["attr_\"stderr\""], 0, places=3) - self.assertAlmostEqual(res["attr_\"intercept\""], 0, places=3) - self.assertAlmostEqual(res["attr_\"slope\""], 1.0, places=3) + self.assertAlmostEqual(res['attr_"pvalue"'], 0, places=3) + self.assertAlmostEqual(res['attr_"stderr"'], 0, places=3) + self.assertAlmostEqual(res['attr_"intercept"'], 0, places=3) + self.assertAlmostEqual(res['attr_"slope"'], 1.0, places=3) def test_linear_trend_timewise_days(self): """Test linear_trend_timewise function with day intervals.""" # Try with different days x = pd.Series( [0, 24, 48, 72], - index=pd.DatetimeIndex([ - '2018-01-01 04:00:00', '2018-01-02 04:00:00', - '2018-01-03 04:00:00', '2018-01-04 04:00:00' - ]), + index=pd.DatetimeIndex( + [ + "2018-01-01 04:00:00", + "2018-01-02 04:00:00", + "2018-01-03 04:00:00", + "2018-01-04 04:00:00", + ] + ), ) - param = [{"attr": "pvalue"}, {"attr": "rvalue"}, {"attr": "intercept"}, {"attr": "slope"}, {"attr": "stderr"}] + param = [ + {"attr": "pvalue"}, + {"attr": "rvalue"}, + {"attr": "intercept"}, + {"attr": "slope"}, + {"attr": "stderr"}, + ] res = linear_trend_timewise(x, param) res = pd.Series(dict(res)) - self.assertAlmostEqual(res["attr_\"pvalue\""], 0, places=3) - self.assertAlmostEqual(res["attr_\"stderr\""], 0, places=3) - self.assertAlmostEqual(res["attr_\"intercept\""], 0, places=3) - self.assertAlmostEqual(res["attr_\"slope\""], 1.0, places=3) + self.assertAlmostEqual(res['attr_"pvalue"'], 0, places=3) + self.assertAlmostEqual(res['attr_"stderr"'], 0, places=3) + self.assertAlmostEqual(res['attr_"intercept"'], 0, places=3) + self.assertAlmostEqual(res['attr_"slope"'], 1.0, places=3) def test_linear_trend_timewise_seconds(self): """Test linear_trend_timewise function with second intervals.""" # Try with different days x = pd.Series( [0, 1 / float(3600), 2 / float(3600), 3 / float(3600)], - index=pd.DatetimeIndex([ - '2018-01-01 04:00:01', '2018-01-01 04:00:02', - '2018-01-01 04:00:03', '2018-01-01 04:00:04' - ]), + index=pd.DatetimeIndex( + [ + "2018-01-01 04:00:01", + "2018-01-01 04:00:02", + "2018-01-01 04:00:03", + "2018-01-01 04:00:04", + ] + ), ) - param = [{"attr": "pvalue"}, {"attr": "rvalue"}, {"attr": "intercept"}, {"attr": "slope"}, {"attr": "stderr"}] + param = [ + {"attr": "pvalue"}, + {"attr": "rvalue"}, + {"attr": "intercept"}, + {"attr": "slope"}, + {"attr": "stderr"}, + ] res = linear_trend_timewise(x, param) res = pd.Series(dict(res)) - self.assertAlmostEqual(res["attr_\"pvalue\""], 0, places=3) - self.assertAlmostEqual(res["attr_\"stderr\""], 0, places=3) - self.assertAlmostEqual(res["attr_\"intercept\""], 0, 
places=3) - self.assertAlmostEqual(res["attr_\"slope\""], 1.0, places=3) + self.assertAlmostEqual(res['attr_"pvalue"'], 0, places=3) + self.assertAlmostEqual(res['attr_"stderr"'], 0, places=3) + self.assertAlmostEqual(res['attr_"intercept"'], 0, places=3) + self.assertAlmostEqual(res['attr_"slope"'], 1.0, places=3) def test_linear_trend_timewise_years(self): """Test linear_trend_timewise function with year intervals.""" # Try with different days x = pd.Series( - [0, 365 * 24, 365 * 48, 365 * 72 + 24], # Add 24 to the last one since it's a leap year - index=pd.DatetimeIndex([ - '2018-01-01 04:00:00', '2019-01-01 04:00:00', - '2020-01-01 04:00:00', '2021-01-01 04:00:00' - ]), + [ + 0, + 365 * 24, + 365 * 48, + 365 * 72 + 24, + ], # Add 24 to the last one since it's a leap year + index=pd.DatetimeIndex( + [ + "2018-01-01 04:00:00", + "2019-01-01 04:00:00", + "2020-01-01 04:00:00", + "2021-01-01 04:00:00", + ] + ), ) - param = [{"attr": "pvalue"}, {"attr": "rvalue"}, {"attr": "intercept"}, {"attr": "slope"}, {"attr": "stderr"}] + param = [ + {"attr": "pvalue"}, + {"attr": "rvalue"}, + {"attr": "intercept"}, + {"attr": "slope"}, + {"attr": "stderr"}, + ] res = linear_trend_timewise(x, param) res = pd.Series(dict(res)) - self.assertAlmostEqual(res["attr_\"pvalue\""], 0, places=3) - self.assertAlmostEqual(res["attr_\"stderr\""], 0, places=3) - self.assertAlmostEqual(res["attr_\"intercept\""], 0, places=3) - self.assertAlmostEqual(res["attr_\"slope\""], 1.0, places=3) + self.assertAlmostEqual(res['attr_"pvalue"'], 0, places=3) + self.assertAlmostEqual(res['attr_"stderr"'], 0, places=3) + self.assertAlmostEqual(res['attr_"intercept"'], 0, places=3) + self.assertAlmostEqual(res['attr_"slope"'], 1.0, places=3) def test_change_quantiles(self): """Test change_quantiles function when changing from `sum` to `np.sum`.""" np.random.seed(0) - res = change_quantiles(np.random.rand(10000) * 1000, 0.1, 0.2, False, 'mean') + res = change_quantiles(np.random.rand(10000) * 1000, 0.1, 0.2, False, "mean") self.assertAlmostEqual(res, -0.9443846621365727) def test_count_above(self): self.assertEqualPandasSeriesWrapper(count_above, [1] * 10, 1, t=1) self.assertEqualPandasSeriesWrapper(count_above, list(range(10)), 1, t=0) self.assertEqualPandasSeriesWrapper(count_above, list(range(10)), 0.5, t=5) - self.assertEqualPandasSeriesWrapper(count_above, [0.1, 0.2, 0.3] * 3, 2 / 3, t=0.2) + self.assertEqualPandasSeriesWrapper( + count_above, [0.1, 0.2, 0.3] * 3, 2 / 3, t=0.2 + ) self.assertEqualPandasSeriesWrapper(count_above, [np.NaN, 0, 1] * 3, 2 / 3, t=0) - self.assertEqualPandasSeriesWrapper(count_above, [np.NINF, 0, 1] * 3, 2 / 3, t=0) + self.assertEqualPandasSeriesWrapper( + count_above, [np.NINF, 0, 1] * 3, 2 / 3, t=0 + ) self.assertEqualPandasSeriesWrapper(count_above, [np.PINF, 0, 1] * 3, 1, t=0) - self.assertEqualPandasSeriesWrapper(count_above, [np.NaN, 0, 1] * 3, 0, t=np.NaN) - self.assertEqualPandasSeriesWrapper(count_above, [np.NINF, 0, np.PINF] * 3, 1, t=np.NINF) - self.assertEqualPandasSeriesWrapper(count_above, [np.PINF, 0, 1] * 3, 1 / 3, t=np.PINF) + self.assertEqualPandasSeriesWrapper( + count_above, [np.NaN, 0, 1] * 3, 0, t=np.NaN + ) + self.assertEqualPandasSeriesWrapper( + count_above, [np.NINF, 0, np.PINF] * 3, 1, t=np.NINF + ) + self.assertEqualPandasSeriesWrapper( + count_above, [np.PINF, 0, 1] * 3, 1 / 3, t=np.PINF + ) def test_count_below(self): self.assertEqualPandasSeriesWrapper(count_below, [1] * 10, 1, t=1) self.assertEqualPandasSeriesWrapper(count_below, list(range(10)), 1 / 10, t=0) 
self.assertEqualPandasSeriesWrapper(count_below, list(range(10)), 6 / 10, t=5) - self.assertEqualPandasSeriesWrapper(count_below, [0.1, 0.2, 0.3] * 3, 2 / 3, t=0.2) + self.assertEqualPandasSeriesWrapper( + count_below, [0.1, 0.2, 0.3] * 3, 2 / 3, t=0.2 + ) self.assertEqualPandasSeriesWrapper(count_below, [np.NaN, 0, 1] * 3, 1 / 3, t=0) - self.assertEqualPandasSeriesWrapper(count_below, [np.NINF, 0, 1] * 3, 2 / 3, t=0) - self.assertEqualPandasSeriesWrapper(count_below, [np.PINF, 0, 1] * 3, 1 / 3, t=0) - self.assertEqualPandasSeriesWrapper(count_below, [np.NaN, 0, 1] * 3, 0, t=np.NaN) - self.assertEqualPandasSeriesWrapper(count_below, [np.NINF, 0, np.PINF] * 3, 1 / 3, t=np.NINF) - self.assertEqualPandasSeriesWrapper(count_below, [np.PINF, 0, 1] * 3, 1, t=np.PINF) + self.assertEqualPandasSeriesWrapper( + count_below, [np.NINF, 0, 1] * 3, 2 / 3, t=0 + ) + self.assertEqualPandasSeriesWrapper( + count_below, [np.PINF, 0, 1] * 3, 1 / 3, t=0 + ) + self.assertEqualPandasSeriesWrapper( + count_below, [np.NaN, 0, 1] * 3, 0, t=np.NaN + ) + self.assertEqualPandasSeriesWrapper( + count_below, [np.NINF, 0, np.PINF] * 3, 1 / 3, t=np.NINF + ) + self.assertEqualPandasSeriesWrapper( + count_below, [np.PINF, 0, 1] * 3, 1, t=np.PINF + ) def test_benford_correlation(self): # A test with list of random values @@ -1322,7 +2000,19 @@ def test_benford_correlation(self): equal_list = [1, 2, 3, 4, 5, 6, 7, 8, 9] # A list containing NaN - list_with_nan = [1.354, 0.058, 0.055, 0.99, 3.15, np.nan, 0.3, 2.3, 0, 0.59, 0.74] + list_with_nan = [ + 1.354, + 0.058, + 0.055, + 0.99, + 3.15, + np.nan, + 0.3, + 2.3, + 0, + 0.59, + 0.74, + ] self.assertAlmostEqual(benford_correlation(random_list), 0.39458056) self.assertAlmostEqual(benford_correlation(fibonacci_list), 0.998003988) @@ -1354,18 +2044,18 @@ def test_query_similarity_count(self): def test_matrix_profile_window(self): # Test matrix profile output with specified window np.random.seed(9999) - ts = np.random.uniform(size=2**10) - w = 2**5 + ts = np.random.uniform(size=2 ** 10) + w = 2 ** 5 subq = ts[0:w] ts[0:w] = subq - ts[w+100:w+100+w] = subq + ts[w + 100 : w + 100 + w] = subq param = [ {"threshold": 0.98, "windows": 36, "feature": "min"}, {"threshold": 0.98, "windows": 36, "feature": "max"}, {"threshold": 0.98, "windows": 36, "feature": "mean"}, {"threshold": 0.98, "windows": 36, "feature": "median"}, {"threshold": 0.98, "windows": 36, "feature": "25"}, - {"threshold": 0.98, "windows": 36, "feature": "75"} + {"threshold": 0.98, "windows": 36, "feature": "75"}, ] self.assertAlmostEqual(matrix_profile(ts, param=param)[0][1], 2.825786727580335) @@ -1373,11 +2063,11 @@ def test_matrix_profile_window(self): def test_matrix_profile_no_window(self): # Test matrix profile output with no window specified np.random.seed(9999) - ts = np.random.uniform(size=2**10) - w = 2**5 + ts = np.random.uniform(size=2 ** 10) + w = 2 ** 5 subq = ts[0:w] ts[0:w] = subq - ts[w+100:w+100+w] = subq + ts[w + 100 : w + 100 + w] = subq param = [ {"threshold": 0.98, "feature": "min"}, @@ -1385,7 +2075,7 @@ def test_matrix_profile_no_window(self): {"threshold": 0.98, "feature": "mean"}, {"threshold": 0.98, "feature": "median"}, {"threshold": 0.98, "feature": "25"}, - {"threshold": 0.98, "feature": "75"} + {"threshold": 0.98, "feature": "75"}, ] # Test matrix profile output with no window specified @@ -1393,7 +2083,7 @@ def test_matrix_profile_no_window(self): def test_matrix_profile_nan(self): # Test matrix profile of NaNs (NaN output) - ts = np.random.uniform(size=2**6) + ts = 
np.random.uniform(size=2 ** 6) ts[:] = np.nan param = [ @@ -1402,14 +2092,13 @@ def test_matrix_profile_nan(self): {"threshold": 0.98, "windows": None, "feature": "mean"}, {"threshold": 0.98, "windows": None, "feature": "median"}, {"threshold": 0.98, "windows": None, "feature": "25"}, - {"threshold": 0.98, "windows": None, "feature": "75"} + {"threshold": 0.98, "windows": None, "feature": "75"}, ] self.assertTrue(np.isnan(matrix_profile(ts, param=param)[0][1])) class FriedrichTestCase(TestCase): - def test_estimate_friedrich_coefficients(self): """ Estimate friedrich coefficients @@ -1434,13 +2123,24 @@ def test_friedrich_coefficients(self): x = np.zeros(100) res = pd.Series(dict(friedrich_coefficients(x, param))) - expected_index = ["coeff_0__m_2__r_30", "coeff_1__m_2__r_30", "coeff_2__m_2__r_30", "coeff_3__m_2__r_30"] + expected_index = [ + "coeff_0__m_2__r_30", + "coeff_1__m_2__r_30", + "coeff_2__m_2__r_30", + "coeff_3__m_2__r_30", + ] self.assertCountEqual(list(res.index), expected_index) self.assertTrue(np.sum(np.isnan(res)), 3) - def test_friedrich_number_of_returned_features_is_equal_to_number_of_parameters(self): - """ unit test for issue 501 """ - param = [{'m': 3, 'r': 5, 'coeff': 2}, {'m': 3, 'r': 5, 'coeff': 3}, {'m': 3, 'r': 2, 'coeff': 3}] + def test_friedrich_number_of_returned_features_is_equal_to_number_of_parameters( + self, + ): + """unit test for issue 501""" + param = [ + {"m": 3, "r": 5, "coeff": 2}, + {"m": 3, "r": 5, "coeff": 3}, + {"m": 3, "r": 2, "coeff": 3}, + ] x = np.zeros(100) res = pd.Series(dict(friedrich_coefficients(x, param))) @@ -1450,13 +2150,43 @@ def test_friedrich_number_of_returned_features_is_equal_to_number_of_parameters( def test_friedrich_equal_to_snapshot(self): param = [{"coeff": coeff, "m": 2, "r": 30} for coeff in range(4)] - x = np.array([-0.53, -0.61, -1.26, -0.88, -0.34, 0.58, 2.86, -0.47, 0.78, - -0.45, -0.27, 0.43, 1.72, 0.26, 1.02, -0.09, 0.65, 1.49, - -0.95, -1.02, -0.64, -1.63, -0.71, -0.43, -1.69, 0.05, 1.58, - 1.1, 0.55, -1.02]) + x = np.array( + [ + -0.53, + -0.61, + -1.26, + -0.88, + -0.34, + 0.58, + 2.86, + -0.47, + 0.78, + -0.45, + -0.27, + 0.43, + 1.72, + 0.26, + 1.02, + -0.09, + 0.65, + 1.49, + -0.95, + -1.02, + -0.64, + -1.63, + -0.71, + -0.43, + -1.69, + 0.05, + 1.58, + 1.1, + 0.55, + -1.02, + ] + ) res = pd.Series(dict(friedrich_coefficients(x, param))) - self.assertAlmostEqual(res['coeff_0__m_2__r_30'], -0.24536975738843042) - self.assertAlmostEqual(res['coeff_1__m_2__r_30'], -0.533309548662685) - self.assertAlmostEqual(res['coeff_2__m_2__r_30'], 0.2759399238199404) + self.assertAlmostEqual(res["coeff_0__m_2__r_30"], -0.24536975738843042) + self.assertAlmostEqual(res["coeff_1__m_2__r_30"], -0.533309548662685) + self.assertAlmostEqual(res["coeff_2__m_2__r_30"], 0.2759399238199404) diff --git a/tests/units/feature_extraction/test_settings.py b/tests/units/feature_extraction/test_settings.py index 99f744927..3dc0a6676 100644 --- a/tests/units/feature_extraction/test_settings.py +++ b/tests/units/feature_extraction/test_settings.py @@ -8,8 +8,15 @@ import numpy as np import pandas as pd from tsfresh.feature_extraction.extraction import extract_features -from tsfresh.feature_extraction.settings import ComprehensiveFCParameters, MinimalFCParameters, \ - EfficientFCParameters, from_columns, TimeBasedFCParameters, IndexBasedFCParameters, PickeableSettings +from tsfresh.feature_extraction.settings import ( + ComprehensiveFCParameters, + MinimalFCParameters, + EfficientFCParameters, + from_columns, + TimeBasedFCParameters, + 
IndexBasedFCParameters, + PickeableSettings, +) from tsfresh.feature_extraction import feature_calculators from pandas.testing import assert_frame_equal @@ -37,52 +44,94 @@ def test_from_column_correct_for_selected_columns(self): tsn = "TEST_TIME_SERIES" # Aggregate functions - feature_names = [tsn + '__sum_values', tsn + "__median", tsn + "__length", tsn + "__sample_entropy"] + feature_names = [ + tsn + "__sum_values", + tsn + "__median", + tsn + "__length", + tsn + "__sample_entropy", + ] # Aggregate functions with params - feature_names += [tsn + '__quantile__q_10', tsn + '__quantile__q_70', tsn + '__number_peaks__n_30', - tsn + '__value_count__value_inf', tsn + '__value_count__value_-inf', - tsn + '__value_count__value_nan'] + feature_names += [ + tsn + "__quantile__q_10", + tsn + "__quantile__q_70", + tsn + "__number_peaks__n_30", + tsn + "__value_count__value_inf", + tsn + "__value_count__value_-inf", + tsn + "__value_count__value_nan", + ] # Apply functions - feature_names += [tsn + '__ar_coefficient__k_20__coeff_4', tsn + '__ar_coefficient__coeff_10__k_-1'] + feature_names += [ + tsn + "__ar_coefficient__k_20__coeff_4", + tsn + "__ar_coefficient__coeff_10__k_-1", + ] kind_to_fc_parameters = from_columns(feature_names) - self.assertCountEqual(list(kind_to_fc_parameters[tsn].keys()), - ["sum_values", "median", "length", "sample_entropy", "quantile", "number_peaks", - "ar_coefficient", "value_count"]) + self.assertCountEqual( + list(kind_to_fc_parameters[tsn].keys()), + [ + "sum_values", + "median", + "length", + "sample_entropy", + "quantile", + "number_peaks", + "ar_coefficient", + "value_count", + ], + ) self.assertIsNone(kind_to_fc_parameters[tsn]["sum_values"]) - self.assertEqual(kind_to_fc_parameters[tsn]["ar_coefficient"], - [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}]) + self.assertEqual( + kind_to_fc_parameters[tsn]["ar_coefficient"], + [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}], + ) - self.assertEqual(kind_to_fc_parameters[tsn]["value_count"], - [{"value": np.PINF}, {"value": np.NINF}, {"value": np.NaN}]) + self.assertEqual( + kind_to_fc_parameters[tsn]["value_count"], + [{"value": np.PINF}, {"value": np.NINF}, {"value": np.NaN}], + ) def test_from_column_correct_for_comprehensive_fc_parameters(self): fset = ComprehensiveFCParameters() - X_org = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}), - default_fc_parameters=fset, - column_id="id", column_value="value", - n_jobs=0) + X_org = extract_features( + pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}), + default_fc_parameters=fset, + column_id="id", + column_value="value", + n_jobs=0, + ) inferred_fset = from_columns(X_org) - X_new = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}), - kind_to_fc_parameters=inferred_fset, - column_id="id", column_value="value", - n_jobs=0) + X_new = extract_features( + pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}), + kind_to_fc_parameters=inferred_fset, + column_id="id", + column_value="value", + n_jobs=0, + ) assert_frame_equal(X_org.sort_index(), X_new.sort_index()) def test_from_columns_ignores_columns(self): tsn = "TEST_TIME_SERIES" - feature_names = [tsn + '__sum_values', tsn + "__median", tsn + "__length", tsn + "__sample_entropy"] + feature_names = [ + tsn + "__sum_values", + tsn + "__median", + tsn + "__length", + tsn + "__sample_entropy", + ] feature_names += ["THIS_COL_SHOULD_BE_IGNORED"] - kind_to_fc_parameters = from_columns(feature_names, columns_to_ignore=["THIS_COL_SHOULD_BE_IGNORED", - "THIS_AS_WELL"]) + 
kind_to_fc_parameters = from_columns( + feature_names, + columns_to_ignore=["THIS_COL_SHOULD_BE_IGNORED", "THIS_AS_WELL"], + ) - self.assertCountEqual(list(kind_to_fc_parameters[tsn].keys()), - ["sum_values", "median", "length", "sample_entropy"]) + self.assertCountEqual( + list(kind_to_fc_parameters[tsn].keys()), + ["sum_values", "median", "length", "sample_entropy"], + ) def test_default_calculates_all_features(self): """ @@ -90,14 +139,20 @@ def test_default_calculates_all_features(self): in tsfresh.feature_extraction.feature_calculators """ settings = ComprehensiveFCParameters() - all_feature_calculators = [name for name, func in feature_calculators.__dict__.items() - if hasattr(func, "fctype") - and not hasattr(func, 'input_type')] + all_feature_calculators = [ + name + for name, func in feature_calculators.__dict__.items() + if hasattr(func, "fctype") and not hasattr(func, "input_type") + ] for calculator in all_feature_calculators: - self.assertIn(calculator, settings, - msg='Default ComprehensiveFCParameters object does not setup calculation of {}' - .format(calculator)) + self.assertIn( + calculator, + settings, + msg="Default ComprehensiveFCParameters object does not setup calculation of {}".format( + calculator + ), + ) def test_from_columns_correct_for_different_kind_datatypes(self): """The `settings.from_columns()` function is supposed to save the feature extraction / selection results so it @@ -106,16 +161,32 @@ def test_from_columns_correct_for_different_kind_datatypes(self): 'kind' column is lost. For example, even if the 'kind' values are in int32, in the resulting settings dict, the type of the top level keys (representing different kind values) will be str """ - df = pd.DataFrame({'id': [1, 1, 1, 1], - 'time': [1, 1, 2, 2], - 'kind': [1, 2, 1, 2], - 'value': [1, 2, 3, 4]}) - - features = extract_features(df, column_id='id', column_sort='time', column_kind='kind', column_value='value', - default_fc_parameters=MinimalFCParameters()) + df = pd.DataFrame( + { + "id": [1, 1, 1, 1], + "time": [1, 1, 2, 2], + "kind": [1, 2, 1, 2], + "value": [1, 2, 3, 4], + } + ) + + features = extract_features( + df, + column_id="id", + column_sort="time", + column_kind="kind", + column_value="value", + default_fc_parameters=MinimalFCParameters(), + ) sample_settings = from_columns(features) - X = extract_features(df, column_id='id', column_sort='time', column_kind='kind', column_value='value', - kind_to_fc_parameters=sample_settings) + X = extract_features( + df, + column_id="id", + column_sort="time", + column_kind="kind", + column_value="value", + kind_to_fc_parameters=sample_settings, + ) assert X.shape == (1, 2 * len(MinimalFCParameters())) @@ -126,11 +197,18 @@ class TestEfficientFCParameters(TestCase): def test_extraction_runs_through(self): rfs = EfficientFCParameters() - data = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]], columns=["id", "time", "kind", "value"]) - - extracted_features = extract_features(data, default_fc_parameters=rfs, - column_kind="kind", column_value="value", - column_sort="time", column_id="id") + data = pd.DataFrame( + [[0, 0, 0, 0], [1, 0, 0, 0]], columns=["id", "time", "kind", "value"] + ) + + extracted_features = extract_features( + data, + default_fc_parameters=rfs, + column_kind="kind", + column_value="value", + column_sort="time", + column_id="id", + ) self.assertCountEqual(extracted_features.index, [0, 1]) @@ -140,13 +218,20 @@ def test_contains_all_non_high_comp_cost_features(self): in tsfresh.feature_extraction.feature_calculators that do not have the 
attribute "high_comp_cost" """ rfs = EfficientFCParameters() - all_feature_calculators = [name for name, func in feature_calculators.__dict__.items() - if hasattr(func, "fctype") and not hasattr(func, "high_comp_cost")] + all_feature_calculators = [ + name + for name, func in feature_calculators.__dict__.items() + if hasattr(func, "fctype") and not hasattr(func, "high_comp_cost") + ] for calculator in all_feature_calculators: - self.assertIn(calculator, rfs, - msg='Default EfficientFCParameters object does not setup calculation of {}' - .format(calculator)) + self.assertIn( + calculator, + rfs, + msg="Default EfficientFCParameters object does not setup calculation of {}".format( + calculator + ), + ) def test_contains_all_time_based_features(self): """ @@ -155,13 +240,20 @@ def test_contains_all_time_based_features(self): attribute "index_type" == pd.DatetimeIndex """ rfs = TimeBasedFCParameters() - all_feature_calculators = [name for name, func in feature_calculators.__dict__.items() - if not getattr(func, "index_type", False) != pd.DatetimeIndex] + all_feature_calculators = [ + name + for name, func in feature_calculators.__dict__.items() + if not getattr(func, "index_type", False) != pd.DatetimeIndex + ] for calculator in all_feature_calculators: - self.assertIn(calculator, rfs, - msg='Default TimeBasedFCParameters object does not setup calculation of {}' - .format(calculator)) + self.assertIn( + calculator, + rfs, + msg="Default TimeBasedFCParameters object does not setup calculation of {}".format( + calculator + ), + ) def test_contains_all_index_based_features(self): """ @@ -170,13 +262,19 @@ def test_contains_all_index_based_features(self): attribute "input" == "pd.Series" """ rfs = IndexBasedFCParameters() - all_feature_calculators = [name for name, func in feature_calculators.__dict__.items() - if getattr(func, "input", None) == "pd.Series"] + all_feature_calculators = [ + name + for name, func in feature_calculators.__dict__.items() + if getattr(func, "input", None) == "pd.Series" + ] for calculator in all_feature_calculators: - self.assertIn(calculator, rfs, - msg='Default IndexBasedFCParameters object does not setup calculation ' - 'of {}'.format(calculator)) + self.assertIn( + calculator, + rfs, + msg="Default IndexBasedFCParameters object does not setup calculation " + "of {}".format(calculator), + ) class TestMinimalSettingsObject(TestCase): @@ -196,15 +294,34 @@ def test_all_minimal_features_in(self): def test_extraction_runs_through(self): mfs = MinimalFCParameters() - data = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]], columns=["id", "time", "kind", "value"]) - - extracted_features = extract_features(data, default_fc_parameters=mfs, - column_kind="kind", column_value="value", - column_sort="time", column_id="id") - - self.assertCountEqual(extracted_features.columns, ["0__median", "0__standard_deviation", "0__sum_values", - "0__maximum", "0__variance", "0__minimum", "0__mean", - "0__length", "0__root_mean_square", "0__absolute_maximum"]) + data = pd.DataFrame( + [[0, 0, 0, 0], [1, 0, 0, 0]], columns=["id", "time", "kind", "value"] + ) + + extracted_features = extract_features( + data, + default_fc_parameters=mfs, + column_kind="kind", + column_value="value", + column_sort="time", + column_id="id", + ) + + self.assertCountEqual( + extracted_features.columns, + [ + "0__median", + "0__standard_deviation", + "0__sum_values", + "0__maximum", + "0__variance", + "0__minimum", + "0__mean", + "0__length", + "0__root_mean_square", + "0__absolute_maximum", + ], + ) 
self.assertCountEqual(extracted_features.index, [0, 1]) @@ -213,7 +330,10 @@ def test_settings_pickable(self): settings = PickeableSettings() settings["test"] = 3 settings[lambda x: x + 1] = None - def f(x): return x - 2 + + def f(x): + return x - 2 + settings[f] = {"this": "is a test"} dumped_settings = pickle.dumps(settings) @@ -223,6 +343,8 @@ def f(x): return x - 2 self.assertEqual(len(settings), 3) for key in settings: - self.assertTrue(not callable(key) or - (key(3) == 4 and settings[key] is None) or - (key(3) == 1 and settings[key] == {"this": "is a test"})) + self.assertTrue( + not callable(key) + or (key(3) == 4 and settings[key] is None) + or (key(3) == 1 and settings[key] == {"this": "is a test"}) + ) diff --git a/tests/units/feature_selection/test_checks.py b/tests/units/feature_selection/test_checks.py index 80bc0d1a1..da4ace43d 100644 --- a/tests/units/feature_selection/test_checks.py +++ b/tests/units/feature_selection/test_checks.py @@ -7,8 +7,12 @@ import numpy as np from tsfresh.defaults import TEST_FOR_BINARY_TARGET_REAL_FEATURE -from tsfresh.feature_selection.significance_tests import target_real_feature_binary_test,\ - target_real_feature_real_test, target_binary_feature_real_test, target_binary_feature_binary_test +from tsfresh.feature_selection.significance_tests import ( + target_real_feature_binary_test, + target_real_feature_real_test, + target_binary_feature_real_test, + target_binary_feature_binary_test, +) from tests.fixtures import warning_free @@ -39,22 +43,31 @@ class TestChecksBinaryReal: def test_check_target_is_binary(self, real_series): with pytest.raises(ValueError): - target_binary_feature_real_test(x=real_series, y=real_series, - test=TEST_FOR_BINARY_TARGET_REAL_FEATURE) + target_binary_feature_real_test( + x=real_series, y=real_series, test=TEST_FOR_BINARY_TARGET_REAL_FEATURE + ) def test_checks_test_function(self, binary_series, real_series): with pytest.raises(ValueError): - target_binary_feature_real_test(x=real_series, y=binary_series, test="other_unknown_function") + target_binary_feature_real_test( + x=real_series, y=binary_series, test="other_unknown_function" + ) def test_checks_feature_nan(self, real_series_with_nan, binary_series): with pytest.raises(ValueError): - target_binary_feature_real_test(x=real_series_with_nan, y=binary_series, - test=TEST_FOR_BINARY_TARGET_REAL_FEATURE) + target_binary_feature_real_test( + x=real_series_with_nan, + y=binary_series, + test=TEST_FOR_BINARY_TARGET_REAL_FEATURE, + ) def test_checks_target_nan(self, binary_series_with_nan, real_series): with pytest.raises(ValueError): - target_binary_feature_real_test(x=real_series, y=binary_series_with_nan, - test=TEST_FOR_BINARY_TARGET_REAL_FEATURE) + target_binary_feature_real_test( + x=real_series, + y=binary_series_with_nan, + test=TEST_FOR_BINARY_TARGET_REAL_FEATURE, + ) def test_check_feature_is_series(self, binary_series, real_series): with pytest.raises(TypeError): diff --git a/tests/units/feature_selection/test_fdr_control.py b/tests/units/feature_selection/test_fdr_control.py index c82c5a97c..3e1b3d6ed 100644 --- a/tests/units/feature_selection/test_fdr_control.py +++ b/tests/units/feature_selection/test_fdr_control.py @@ -8,23 +8,34 @@ import numpy as np -@pytest.mark.parametrize("p_value, ind, fdr, expected", - [([0, 0, 0], True, 0.10, [True, True, True]), - ([0, 0, 0], False, 0.10, [True, True, True]), - ([0.1, 0.15, 0.2, 0], True, 0.20, [True, True, True, True]), - ([0.1, 0.15, 0.2, 0], False, 0.20, [False, False, False, True]), - ([0.1, 0.1, 0.05], 
True, 0.20, [True, True, True]), - ([0.1, 0.11, 0.05], False, 0.20, [False, False, False]), - ([0.1, 0.1, 0.05], False, 0.20, [True, True, True]), - ([.00356, .01042, .01208, .02155, .03329, .11542], True, 0.05, - [True, True, True, True, True, False]), - ([.00356, .01042, .01208, .02155, .03329, .11542], False, 0.05, - [False, False, False, False, False, False]), - ([0.11, 0.001, 0.05], False, 0.20, [False, True, True]), - ]) +@pytest.mark.parametrize( + "p_value, ind, fdr, expected", + [ + ([0, 0, 0], True, 0.10, [True, True, True]), + ([0, 0, 0], False, 0.10, [True, True, True]), + ([0.1, 0.15, 0.2, 0], True, 0.20, [True, True, True, True]), + ([0.1, 0.15, 0.2, 0], False, 0.20, [False, False, False, True]), + ([0.1, 0.1, 0.05], True, 0.20, [True, True, True]), + ([0.1, 0.11, 0.05], False, 0.20, [False, False, False]), + ([0.1, 0.1, 0.05], False, 0.20, [True, True, True]), + ( + [0.00356, 0.01042, 0.01208, 0.02155, 0.03329, 0.11542], + True, + 0.05, + [True, True, True, True, True, False], + ), + ( + [0.00356, 0.01042, 0.01208, 0.02155, 0.03329, 0.11542], + False, + 0.05, + [False, False, False, False, False, False], + ), + ([0.11, 0.001, 0.05], False, 0.20, [False, True, True]), + ], +) def test_fdr_control(p_value, ind, fdr, expected): df = pd.DataFrame({"p_value": p_value}) - method = 'fdr_bh' if ind else 'fdr_by' + method = "fdr_bh" if ind else "fdr_by" df["relevant"] = multipletests(pvals=df.p_value, alpha=fdr, method=method)[0] result = df["relevant"].values expected = np.array(expected) diff --git a/tests/units/feature_selection/test_feature_significance.py b/tests/units/feature_selection/test_feature_significance.py index ad2093927..0d013cc63 100644 --- a/tests/units/feature_selection/test_feature_significance.py +++ b/tests/units/feature_selection/test_feature_significance.py @@ -26,7 +26,9 @@ def test_binary_target_mixed_case(self): z[z == 2] = 1 X["rel1"] = z - X["rel2"] = y * np.abs(np.random.normal(0, 1, 1000)) + np.random.normal(0, 1, 1000) + X["rel2"] = y * np.abs(np.random.normal(0, 1, 1000)) + np.random.normal( + 0, 1, 1000 + ) X["rel3"] = y + np.random.normal(0, 0.3, 1000) X["rel4"] = y ** 2 + np.random.normal(0, 1, 1000) X["rel5"] = np.sqrt(y) + np.random.binomial(2, 0.1, 1000) @@ -48,7 +50,7 @@ def test_binary_target_mixed_case(self): # Make sure all selected variables are relevant for kept_feature in feat_rej: - self.assertIn(kept_feature, ['rel1', 'rel2', 'rel3', 'rel4', 'rel5']) + self.assertIn(kept_feature, ["rel1", "rel2", "rel3", "rel4", "rel5"]) self.assertGreater(len(feat_rej), 0) @@ -116,7 +118,7 @@ def test_binary_target_binary_features(self): # Make sure all selected variables are relevant for kept_feature in feat_rej: - self.assertIn(kept_feature, ['rel1', 'rel2', 'rel3', 'rel4', 'rel5']) + self.assertIn(kept_feature, ["rel1", "rel2", "rel3", "rel4", "rel5"]) self.assertGreater(len(feat_rej), 0) @@ -157,7 +159,7 @@ def test_binomial_target_realvalued_features(self): # Make sure all selected variables are relevant for kept_feature in feat_rej: - self.assertIn(kept_feature, ['rel1', 'rel2', 'rel3', 'rel4']) + self.assertIn(kept_feature, ["rel1", "rel2", "rel3", "rel4"]) self.assertGreater(len(feat_rej), 0) @@ -203,7 +205,7 @@ def test_real_target_mixed_case(self): # Make sure all selected variables are relevant for kept_feature in feat_rej: - self.assertIn(kept_feature, ['rel1', 'rel2', 'rel3', 'rel4']) + self.assertIn(kept_feature, ["rel1", "rel2", "rel3", "rel4"]) self.assertGreater(len(feat_rej), 0) @@ -253,7 +255,7 @@ def 
test_real_target_binary_features(self): # Make sure all selected variables are relevant for kept_feature in feat_rej: - self.assertIn(kept_feature, ['rel1', 'rel2']) + self.assertIn(kept_feature, ["rel1", "rel2"]) self.assertGreater(len(feat_rej), 0) @@ -277,7 +279,7 @@ def test_all_features_good(self): # Make sure all selected variables are relevant for kept_feature in feat_rej: - self.assertIn(kept_feature, ['rel1', 'rel2']) + self.assertIn(kept_feature, ["rel1", "rel2"]) self.assertGreater(len(feat_rej), 0) diff --git a/tests/units/feature_selection/test_relevance.py b/tests/units/feature_selection/test_relevance.py index f81a35b44..6a5ebd170 100644 --- a/tests/units/feature_selection/test_relevance.py +++ b/tests/units/feature_selection/test_relevance.py @@ -138,9 +138,11 @@ def test_warning_for_no_relevant_feature( X, y_real, n_jobs=0, ml_task="regression", show_warnings=True ) assert len(record) >= 1 - assert ("No feature was found relevant for regression for fdr level = 0.05 (which corresponds " - "to the maximal percentage of irrelevant features, consider using an higher fdr level " - "or add other features.") in [str(warning.message) for warning in record] + assert ( + "No feature was found relevant for regression for fdr level = 0.05 (which corresponds " + "to the maximal percentage of irrelevant features, consider using an higher fdr level " + "or add other features." + ) in [str(warning.message) for warning in record] def test_multiclass_requires_classification(self, X, y_real): with pytest.raises(AssertionError): diff --git a/tests/units/feature_selection/test_significance_tests.py b/tests/units/feature_selection/test_significance_tests.py index 950c8d6ea..a82d8b700 100644 --- a/tests/units/feature_selection/test_significance_tests.py +++ b/tests/units/feature_selection/test_significance_tests.py @@ -6,8 +6,12 @@ import pandas as pd from tsfresh.defaults import TEST_FOR_BINARY_TARGET_REAL_FEATURE -from tsfresh.feature_selection.significance_tests import target_binary_feature_binary_test, \ - target_binary_feature_real_test, target_real_feature_real_test, target_real_feature_binary_test +from tsfresh.feature_selection.significance_tests import ( + target_binary_feature_binary_test, + target_binary_feature_real_test, + target_real_feature_real_test, + target_real_feature_binary_test, +) @pytest.fixture() @@ -40,39 +44,56 @@ class TestUnsignificant: def minimal_p_value_for_unsignificant_features(self): return 0.05 - def test_feature_selection_target_binary_features_binary(self, minimal_p_value_for_unsignificant_features, - binary_feature, - binary_target_not_related): + def test_feature_selection_target_binary_features_binary( + self, + minimal_p_value_for_unsignificant_features, + binary_feature, + binary_target_not_related, + ): """ Test if the p_value returned by target_binary_feature_binary_test is large enough for highly unsignificant features. 
""" - p_value = target_binary_feature_binary_test(binary_feature, binary_target_not_related) + p_value = target_binary_feature_binary_test( + binary_feature, binary_target_not_related + ) assert minimal_p_value_for_unsignificant_features < p_value - def test_feature_selection_target_binary_features_realvalued(self, minimal_p_value_for_unsignificant_features, - real_feature, - binary_target_not_related): + def test_feature_selection_target_binary_features_realvalued( + self, + minimal_p_value_for_unsignificant_features, + real_feature, + binary_target_not_related, + ): """ Test if the p_value returned by target_binary_feature_binary_test is large enough for highly unsignificant features. """ - p_value = target_binary_feature_real_test(real_feature, binary_target_not_related, - TEST_FOR_BINARY_TARGET_REAL_FEATURE) + p_value = target_binary_feature_real_test( + real_feature, binary_target_not_related, TEST_FOR_BINARY_TARGET_REAL_FEATURE + ) assert minimal_p_value_for_unsignificant_features < p_value - def test_feature_selection_target_realvalued_features_binary(self, minimal_p_value_for_unsignificant_features, - binary_feature, - real_target_not_related): + def test_feature_selection_target_realvalued_features_binary( + self, + minimal_p_value_for_unsignificant_features, + binary_feature, + real_target_not_related, + ): """ Test if the p_value returned by target_real_feature_binary_test is large enough for highly unsignificant features.""" - p_value = target_real_feature_binary_test(binary_feature, real_target_not_related) + p_value = target_real_feature_binary_test( + binary_feature, real_target_not_related + ) assert minimal_p_value_for_unsignificant_features < p_value - def test_feature_selection_target_realvalued_features_realvalued(self, minimal_p_value_for_unsignificant_features, - real_feature, - real_target_not_related): + def test_feature_selection_target_realvalued_features_realvalued( + self, + minimal_p_value_for_unsignificant_features, + real_feature, + real_target_not_related, + ): """ Test if the p_value returned by target_real_feature_real_test is large enough for highly unsignificant features. @@ -86,13 +107,16 @@ class TestSignificant: def maximal_p_value_for_significant_features(self): return 0.15 - def test_feature_selection_target_binary_features_binary(self, maximal_p_value_for_significant_features, - binary_feature): + def test_feature_selection_target_binary_features_binary( + self, maximal_p_value_for_significant_features, binary_feature + ): """ Test if the p_value returned by target_binary_feature_binary_test is low enough for highly significant features. """ - y = binary_feature - pd.Series(np.random.binomial(1, 0.1, 250) + np.random.binomial(1, 0.1, 250)) + y = binary_feature - pd.Series( + np.random.binomial(1, 0.1, 250) + np.random.binomial(1, 0.1, 250) + ) y[y == -1] = 0 y[y == -2] = 0 y[y == 2] = 1 @@ -100,8 +124,9 @@ def test_feature_selection_target_binary_features_binary(self, maximal_p_value_f p_value = target_binary_feature_binary_test(binary_feature, y) assert maximal_p_value_for_significant_features > p_value - def test_feature_selection_target_binary_features_realvalued_mann(self, maximal_p_value_for_significant_features, - real_feature): + def test_feature_selection_target_binary_features_realvalued_mann( + self, maximal_p_value_for_significant_features, real_feature + ): """ Test if the p_value returned by target_binary_feature_real_test is low enough for highly significant features. 
@@ -113,11 +138,14 @@ def test_feature_selection_target_binary_features_realvalued_mann(self, maximal_ y[y == -1] = 0 y[y == 2] = 1 - p_value = target_binary_feature_real_test(real_feature, y, TEST_FOR_BINARY_TARGET_REAL_FEATURE) + p_value = target_binary_feature_real_test( + real_feature, y, TEST_FOR_BINARY_TARGET_REAL_FEATURE + ) assert maximal_p_value_for_significant_features > p_value - def test_feature_selection_target_binary_features_realvalued_smir(self, maximal_p_value_for_significant_features, - real_feature): + def test_feature_selection_target_binary_features_realvalued_smir( + self, maximal_p_value_for_significant_features, real_feature + ): """ Test if the p_value returned by target_binary_feature_real_test is low enough for highly significant features. @@ -132,19 +160,23 @@ def test_feature_selection_target_binary_features_realvalued_smir(self, maximal_ p_value = target_binary_feature_real_test(real_feature, y, test="smir") assert maximal_p_value_for_significant_features > p_value - def test_feature_selection_target_realvalued_features_binary(self, maximal_p_value_for_significant_features, - binary_feature): + def test_feature_selection_target_realvalued_features_binary( + self, maximal_p_value_for_significant_features, binary_feature + ): """ Test if the p_value returned by target_real_feature_binary_test is low enough for highly significant features. """ - y = binary_feature * pd.Series(np.random.normal(0, 1, 250)) + pd.Series(np.random.normal(0, 0.25, 250)) + y = binary_feature * pd.Series(np.random.normal(0, 1, 250)) + pd.Series( + np.random.normal(0, 0.25, 250) + ) p_value = target_real_feature_binary_test(binary_feature, y) assert maximal_p_value_for_significant_features > p_value - def test_feature_selection_target_realvalued_features_realvalued(self, maximal_p_value_for_significant_features, - real_feature): + def test_feature_selection_target_realvalued_features_realvalued( + self, maximal_p_value_for_significant_features, real_feature + ): """ Test if the p_value returned by target_real_feature_real_test is low enough for highly significant features. 
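For orientation alongside the reformatted significance-test cases above, the following is a minimal, hedged sketch of how those tests are called outside the suite. The function names, the positional argument order, and the TEST_FOR_BINARY_TARGET_REAL_FEATURE constant are taken from the imports and calls visible in the hunks themselves; the synthetic series, the random seed, and the variable names are illustrative assumptions only, not part of the patch.

import numpy as np
import pandas as pd

from tsfresh.defaults import TEST_FOR_BINARY_TARGET_REAL_FEATURE
from tsfresh.feature_selection.significance_tests import (
    target_binary_feature_real_test,
    target_real_feature_binary_test,
)

# Illustrative sketch only: build obviously related feature/target pairs.
rng = np.random.RandomState(42)
real_feature = pd.Series(rng.normal(size=250))
binary_feature = pd.Series(rng.binomial(1, 0.5, 250))

# A binary target derived from the real feature, and a real target derived
# from the binary feature, so both tests should report small p-values.
binary_target = (real_feature > 0).astype(int)
real_target = binary_feature * 2.0 + pd.Series(rng.normal(scale=0.1, size=250))

# Each significance test returns a p-value; the smaller it is, the more
# relevant the feature is considered for the given target.
p_real_feature = target_binary_feature_real_test(
    real_feature, binary_target, TEST_FOR_BINARY_TARGET_REAL_FEATURE
)
p_binary_feature = target_real_feature_binary_test(binary_feature, real_target)

print(p_real_feature, p_binary_feature)
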
diff --git a/tests/units/scripts/test_run_tsfresh.py b/tests/units/scripts/test_run_tsfresh.py index b8d04bbb3..ee515a1e9 100644 --- a/tests/units/scripts/test_run_tsfresh.py +++ b/tests/units/scripts/test_run_tsfresh.py @@ -27,7 +27,10 @@ def extract_features_mock(df, **kwargs): return df # Patcher object to be disabled in tearDown - self.patcher = patch('tsfresh.scripts.run_tsfresh.extract_features', side_effect=extract_features_mock) + self.patcher = patch( + "tsfresh.scripts.run_tsfresh.extract_features", + side_effect=extract_features_mock, + ) # Mocked extract function self.mocked_extract_function = self.patcher.start() @@ -48,7 +51,9 @@ def call_main_function(self, input_csv_string=None, arguments=""): output_file_name = "temporary_output_csv_file.csv" arguments_with_filenames = "{input_file_name} {arguments} --output-file-name {output_file_name}".format( - input_file_name=input_file_name, arguments=arguments, output_file_name=output_file_name + input_file_name=input_file_name, + arguments=arguments, + output_file_name=output_file_name, ) run_tsfresh.main(arguments_with_filenames.split()) @@ -60,23 +65,28 @@ def call_main_function(self, input_csv_string=None, arguments=""): return None def test_invalid_arguments(self): - self.assertRaises(SystemExit, self.call_main_function, arguments="--invalid-argument") + self.assertRaises( + SystemExit, self.call_main_function, arguments="--invalid-argument" + ) def test_csv_without_headers_wrong_arguments(self): - self.assertRaises(AttributeError, self.call_main_function, - arguments="--column-id invalid") + self.assertRaises( + AttributeError, self.call_main_function, arguments="--column-id invalid" + ) def test_csv_without_headers(self): input_csv = "1 1 1 1\n1 1 1 1" - output_csv = ",id,time,value\n" \ - "0,0,0,1\n" \ - "1,0,1,1\n" \ - "2,0,2,1\n" \ - "3,0,3,1\n" \ - "4,1,0,1\n" \ - "5,1,1,1\n" \ - "6,1,2,1\n" \ - "7,1,3,1\n" + output_csv = ( + ",id,time,value\n" + "0,0,0,1\n" + "1,0,1,1\n" + "2,0,2,1\n" + "3,0,3,1\n" + "4,1,0,1\n" + "5,1,1,1\n" + "6,1,2,1\n" + "7,1,3,1\n" + ) result_csv = self.call_main_function(input_csv_string=input_csv) @@ -95,7 +105,8 @@ def test_csv_with_header(self): result_csv = self.call_main_function( input_csv_string=input_csv, - arguments="--column-id ID --column-sort SORT --column-value VALUE --column-kind KIND --csv-with-headers") + arguments="--column-id ID --column-sort SORT --column-value VALUE --column-kind KIND --csv-with-headers", + ) self.assertEqual(result_csv, output_csv) diff --git a/tests/units/transformers/test_feature_augmenter.py b/tests/units/transformers/test_feature_augmenter.py index bb1338c27..94d07b5a4 100644 --- a/tests/units/transformers/test_feature_augmenter.py +++ b/tests/units/transformers/test_feature_augmenter.py @@ -14,15 +14,21 @@ def setUp(self): self.test_df = self.create_test_data_sample() fc_parameters = {"length": None} - self.kind_to_fc_parameters = {"a": fc_parameters.copy(), - "b": fc_parameters.copy()} + self.kind_to_fc_parameters = { + "a": fc_parameters.copy(), + "b": fc_parameters.copy(), + } def test_fit_and_transform(self): - augmenter = FeatureAugmenter(column_value="val", column_id="id", column_sort="sort", - column_kind="kind", - kind_to_fc_parameters=self.kind_to_fc_parameters, - n_jobs=0, - disable_progressbar=True) + augmenter = FeatureAugmenter( + column_value="val", + column_id="id", + column_sort="sort", + column_kind="kind", + kind_to_fc_parameters=self.kind_to_fc_parameters, + n_jobs=0, + disable_progressbar=True, + ) # Fit should do nothing returned_df = 
augmenter.fit() @@ -45,7 +51,9 @@ def test_fit_and_transform(self): self.assertEqual(X_transformed.shape, (2, 3)) # Preserve old features - self.assertCountEqual(list(X_transformed.columns), ["feature_1", "a__length", "b__length"]) + self.assertCountEqual( + list(X_transformed.columns), ["feature_1", "a__length", "b__length"] + ) # Features are not allowed to be NaN for index, row in X_transformed.iterrows(): @@ -54,11 +62,15 @@ def test_fit_and_transform(self): self.assertFalse(np.isnan(row["b__length"])) def test_add_features_to_only_a_part(self): - augmenter = FeatureAugmenter(column_value="val", column_id="id", column_sort="sort", - column_kind="kind", - kind_to_fc_parameters=self.kind_to_fc_parameters, - n_jobs=0, - disable_progressbar=True) + augmenter = FeatureAugmenter( + column_value="val", + column_id="id", + column_sort="sort", + column_kind="kind", + kind_to_fc_parameters=self.kind_to_fc_parameters, + n_jobs=0, + disable_progressbar=True, + ) augmenter.set_timeseries_container(self.test_df) @@ -81,14 +93,22 @@ def test_add_features_to_only_a_part(self): self.assertFalse(np.isnan(row["b__length"])) def test_no_ids_present(self): - augmenter = FeatureAugmenter(column_value="val", column_id="id", column_sort="sort", - column_kind="kind", - kind_to_fc_parameters=self.kind_to_fc_parameters, - n_jobs=0, - disable_progressbar=True) + augmenter = FeatureAugmenter( + column_value="val", + column_id="id", + column_sort="sort", + column_kind="kind", + kind_to_fc_parameters=self.kind_to_fc_parameters, + n_jobs=0, + disable_progressbar=True, + ) augmenter.set_timeseries_container(self.test_df) X_with_not_all_ids = pd.DataFrame([{"feature_1": 1}], index=[-999]) - self.assertRaisesRegex(AttributeError, r"The ids of the time series container", - augmenter.transform, X_with_not_all_ids) + self.assertRaisesRegex( + AttributeError, + r"The ids of the time series container", + augmenter.transform, + X_with_not_all_ids, + ) diff --git a/tests/units/transformers/test_per_column_imputer.py b/tests/units/transformers/test_per_column_imputer.py index 599646e06..54bbb1c4e 100644 --- a/tests/units/transformers/test_per_column_imputer.py +++ b/tests/units/transformers/test_per_column_imputer.py @@ -38,8 +38,10 @@ def test_only_nans_and_infs(self): with warnings.catch_warnings(record=True) as w: imputer.fit(X) self.assertEqual(len(w), 1) - self.assertEqual("The columns ['NaNs' 'PINF' 'NINF'] did not have any finite values. Filling with zeros.", - str(w[0].message)) + self.assertEqual( + "The columns ['NaNs' 'PINF' 'NINF'] did not have any finite values. Filling with zeros.", + str(w[0].message), + ) selected_X = imputer.transform(X) @@ -59,8 +61,10 @@ def test_with_numpy_array(self): with warnings.catch_warnings(record=True) as w: imputer.fit(X) self.assertEqual(len(w), 1) - self.assertEqual("The columns ['NaNs' 'PINF' 'NINF'] did not have any finite values. Filling with zeros.", - str(w[0].message)) + self.assertEqual( + "The columns ['NaNs' 'PINF' 'NINF'] did not have any finite values. Filling with zeros.", + str(w[0].message), + ) selected_X = imputer.transform(X) @@ -69,8 +73,10 @@ def test_with_numpy_array(self): with warnings.catch_warnings(record=True) as w: imputer.fit(X_numpy) self.assertEqual(len(w), 1) - self.assertEqual("The columns [0 1 2] did not have any finite values. Filling with zeros.", - str(w[0].message)) + self.assertEqual( + "The columns [0 1 2] did not have any finite values. 
Filling with zeros.", + str(w[0].message), + ) selected_X_numpy = imputer.transform(X_numpy) diff --git a/tests/units/transformers/test_relevant_feature_augmenter.py b/tests/units/transformers/test_relevant_feature_augmenter.py index 1c3656c9c..d8d36f8a8 100644 --- a/tests/units/transformers/test_relevant_feature_augmenter.py +++ b/tests/units/transformers/test_relevant_feature_augmenter.py @@ -20,8 +20,10 @@ class RelevantFeatureAugmenterTestCase(DataTestCase): def setUp(self): self.test_df = self.create_test_data_sample() fc_parameters = {"length": None} - self.kind_to_fc_parameters = {"a": fc_parameters.copy(), - "b": fc_parameters.copy()} + self.kind_to_fc_parameters = { + "a": fc_parameters.copy(), + "b": fc_parameters.copy(), + } def test_not_fitted(self): augmenter = RelevantFeatureAugmenter() @@ -40,9 +42,13 @@ def test_no_timeseries(self): self.assertRaises(RuntimeError, augmenter.fit_transform, X, y) def test_nothing_relevant(self): - augmenter = RelevantFeatureAugmenter(kind_to_fc_parameters=self.kind_to_fc_parameters, - column_value="val", column_id="id", column_sort="sort", - column_kind="kind") + augmenter = RelevantFeatureAugmenter( + kind_to_fc_parameters=self.kind_to_fc_parameters, + column_value="val", + column_id="id", + column_sort="sort", + column_kind="kind", + ) y = pd.Series({10: 1, 500: 0}) X = pd.DataFrame(index=[10, 500]) @@ -64,9 +70,14 @@ def test_filter_only_tsfresh_features_true(self): filtered. This unit tests checks that """ - augmenter = RelevantFeatureAugmenter(kind_to_fc_parameters=self.kind_to_fc_parameters, - filter_only_tsfresh_features=True, - column_value="val", column_id="id", column_sort="sort", column_kind="kind") + augmenter = RelevantFeatureAugmenter( + kind_to_fc_parameters=self.kind_to_fc_parameters, + filter_only_tsfresh_features=True, + column_value="val", + column_id="id", + column_sort="sort", + column_kind="kind", + ) y = pd.Series({10: 1, 500: 0}) X = pd.DataFrame(index=[10, 500]) @@ -78,8 +89,12 @@ def test_filter_only_tsfresh_features_true(self): fit_transformed_X = augmenter.fit_transform(X, y) - self.assertEqual(sum(["pre_feature" == column for column in transformed_X.columns]), 1) - self.assertEqual(sum(["pre_feature" == column for column in fit_transformed_X.columns]), 1) + self.assertEqual( + sum(["pre_feature" == column for column in transformed_X.columns]), 1 + ) + self.assertEqual( + sum(["pre_feature" == column for column in fit_transformed_X.columns]), 1 + ) def test_filter_only_tsfresh_features_false(self): """ @@ -87,9 +102,14 @@ def test_filter_only_tsfresh_features_false(self): filtered. 
This unit tests checks that """ - augmenter = RelevantFeatureAugmenter(kind_to_fc_parameters=self.kind_to_fc_parameters, - filter_only_tsfresh_features=False, - column_value="val", column_id="id", column_sort="sort", column_kind="kind") + augmenter = RelevantFeatureAugmenter( + kind_to_fc_parameters=self.kind_to_fc_parameters, + filter_only_tsfresh_features=False, + column_value="val", + column_id="id", + column_sort="sort", + column_kind="kind", + ) df, y = self.create_test_data_sample_with_target() X = pd.DataFrame(index=np.unique(df.id)) @@ -102,20 +122,32 @@ def test_filter_only_tsfresh_features_false(self): fit_transformed_X = augmenter.fit_transform(X, y) - self.assertEqual(sum(["pre_keep" == column for column in transformed_X.columns]), 1) - self.assertEqual(sum(["pre_drop" == column for column in transformed_X.columns]), 0) - self.assertEqual(sum(["pre_keep" == column for column in fit_transformed_X.columns]), 1) - self.assertEqual(sum(["pre_drop" == column for column in fit_transformed_X.columns]), 0) + self.assertEqual( + sum(["pre_keep" == column for column in transformed_X.columns]), 1 + ) + self.assertEqual( + sum(["pre_drop" == column for column in transformed_X.columns]), 0 + ) + self.assertEqual( + sum(["pre_keep" == column for column in fit_transformed_X.columns]), 1 + ) + self.assertEqual( + sum(["pre_drop" == column for column in fit_transformed_X.columns]), 0 + ) - @mock.patch('tsfresh.transformers.feature_selector.calculate_relevance_table') + @mock.patch("tsfresh.transformers.feature_selector.calculate_relevance_table") def test_does_impute(self, calculate_relevance_table_mock): - df = pd.DataFrame([[1, 1, 1], [2, 1, 1]], columns=['id', 'time', 'value']) + df = pd.DataFrame([[1, 1, 1], [2, 1, 1]], columns=["id", "time", "value"]) X = pd.DataFrame(index=[1]) y = pd.Series([0, 1]) - fc_parameters = {"autocorrelation": [{'lag': 2}]} + fc_parameters = {"autocorrelation": [{"lag": 2}]} - calculate_relevance_table_mock.return_value = pd.DataFrame(columns=['feature', 'p_value', 'relevant']) - augmenter = RelevantFeatureAugmenter(column_id='id', column_sort='time', default_fc_parameters=fc_parameters) + calculate_relevance_table_mock.return_value = pd.DataFrame( + columns=["feature", "p_value", "relevant"] + ) + augmenter = RelevantFeatureAugmenter( + column_id="id", column_sort="time", default_fc_parameters=fc_parameters + ) augmenter.set_timeseries_container(df) with warning_free(): augmenter.fit(X, y) @@ -124,19 +156,34 @@ def test_does_impute(self, calculate_relevance_table_mock): assert not calculate_relevance_table_mock.call_args[0][0].isnull().any().any() def test_no_ids_present(self): - augmenter = RelevantFeatureAugmenter(kind_to_fc_parameters=self.kind_to_fc_parameters, - filter_only_tsfresh_features=False, - column_value="val", column_id="id", column_sort="sort", column_kind="kind") + augmenter = RelevantFeatureAugmenter( + kind_to_fc_parameters=self.kind_to_fc_parameters, + filter_only_tsfresh_features=False, + column_value="val", + column_id="id", + column_sort="sort", + column_kind="kind", + ) df, y = self.create_test_data_sample_with_target() X_with_wrong_ids = pd.DataFrame(index=[-999]) augmenter.set_timeseries_container(df) - self.assertRaisesRegex(AttributeError, r"The ids of the time series container", - augmenter.fit, X_with_wrong_ids, y) - self.assertRaisesRegex(AttributeError, r"The ids of the time series container", - augmenter.fit_transform, X_with_wrong_ids, y) + self.assertRaisesRegex( + AttributeError, + r"The ids of the time series container", + 
augmenter.fit, + X_with_wrong_ids, + y, + ) + self.assertRaisesRegex( + AttributeError, + r"The ids of the time series container", + augmenter.fit_transform, + X_with_wrong_ids, + y, + ) def test_multiclass_selection(self): augmenter = RelevantFeatureAugmenter( @@ -163,16 +210,33 @@ def test_relevant_augmentor_cross_validated(): """ n = 16 # number of samples, needs to be divisable by 4 index = range(n) - df_ts = pd.DataFrame({"time": [10, 11] * n, "id": np.repeat(index, 2), - "value": [0, 1] * (n // 4) + [1, 2] * (n // 4) + # class 0 - [10, 11] * (n // 4) + [12, 14] * (n // 4)}) + df_ts = pd.DataFrame( + { + "time": [10, 11] * n, + "id": np.repeat(index, 2), + "value": [0, 1] * (n // 4) + + [1, 2] * (n // 4) + + [10, 11] * (n // 4) # class 0 + + [12, 14] * (n // 4), + } + ) y = pd.Series(data=[0] * (n // 2) + [1] * (n // 2), index=index) X = pd.DataFrame(index=index) - augmenter = RelevantFeatureAugmenter(column_id='id', column_sort='time', timeseries_container=df_ts, - default_fc_parameters=MinimalFCParameters(), - disable_progressbar=True, show_warnings=False, fdr_level=0.90) - pipeline = Pipeline([('augmenter', augmenter), - ('classifier', RandomForestClassifier(random_state=1))]) + augmenter = RelevantFeatureAugmenter( + column_id="id", + column_sort="time", + timeseries_container=df_ts, + default_fc_parameters=MinimalFCParameters(), + disable_progressbar=True, + show_warnings=False, + fdr_level=0.90, + ) + pipeline = Pipeline( + [ + ("augmenter", augmenter), + ("classifier", RandomForestClassifier(random_state=1)), + ] + ) scores = model_selection.cross_val_score(pipeline, X, y, cv=2) assert (scores == np.array([1, 1])).all() diff --git a/tests/units/utilities/test_dataframe_functions.py b/tests/units/utilities/test_dataframe_functions.py index 875accc06..cc386c3c0 100644 --- a/tests/units/utilities/test_dataframe_functions.py +++ b/tests/units/utilities/test_dataframe_functions.py @@ -16,67 +16,142 @@ class RollingTestCase(TestCase): def test_with_wrong_input(self): - test_df = pd.DataFrame({"id": [0, 0], "kind": ["a", "b"], "value": [3, 3], "sort": [np.NaN, np.NaN]}) - self.assertRaises(ValueError, dataframe_functions.roll_time_series, - df_or_dict=test_df, column_id="id", - column_sort="sort", column_kind="kind", - rolling_direction=1, n_jobs=0) - - test_df = pd.DataFrame({"id": [0, 0], "kind": ["a", "b"], "value": [3, 3], "sort": [1, 1]}) - self.assertRaises(AttributeError, dataframe_functions.roll_time_series, - df_or_dict=test_df, column_id="strange_id", - column_sort="sort", column_kind="kind", - rolling_direction=1, n_jobs=0) - - self.assertRaises(ValueError, dataframe_functions.roll_time_series, - df_or_dict=test_df, column_id=None, - column_sort="sort", column_kind="kind", - rolling_direction=1, n_jobs=0) + test_df = pd.DataFrame( + { + "id": [0, 0], + "kind": ["a", "b"], + "value": [3, 3], + "sort": [np.NaN, np.NaN], + } + ) + self.assertRaises( + ValueError, + dataframe_functions.roll_time_series, + df_or_dict=test_df, + column_id="id", + column_sort="sort", + column_kind="kind", + rolling_direction=1, + n_jobs=0, + ) + + test_df = pd.DataFrame( + {"id": [0, 0], "kind": ["a", "b"], "value": [3, 3], "sort": [1, 1]} + ) + self.assertRaises( + AttributeError, + dataframe_functions.roll_time_series, + df_or_dict=test_df, + column_id="strange_id", + column_sort="sort", + column_kind="kind", + rolling_direction=1, + n_jobs=0, + ) + + self.assertRaises( + ValueError, + dataframe_functions.roll_time_series, + df_or_dict=test_df, + column_id=None, + column_sort="sort", + 
column_kind="kind", + rolling_direction=1, + n_jobs=0, + ) test_df = {"a": pd.DataFrame([{"id": 0}])} - self.assertRaises(ValueError, dataframe_functions.roll_time_series, - df_or_dict=test_df, column_id="id", - column_sort=None, column_kind="kind", - rolling_direction=1, n_jobs=0) - - self.assertRaises(ValueError, dataframe_functions.roll_time_series, - df_or_dict=test_df, column_id=None, - column_sort=None, column_kind="kind", - rolling_direction=1, n_jobs=0) - - self.assertRaises(ValueError, dataframe_functions.roll_time_series, - df_or_dict=test_df, column_id="id", - column_sort=None, column_kind=None, - rolling_direction=0, n_jobs=0) - - self.assertRaises(ValueError, dataframe_functions.roll_time_series, - df_or_dict=test_df, column_id=None, - column_sort=None, column_kind=None, - rolling_direction=0, n_jobs=0) - - test_df = pd.DataFrame({"id": [0, 0], "kind": ["a", "b"], "value": [3, 3], "sort": [1, 1]}) - self.assertRaises(ValueError, dataframe_functions.roll_time_series, - df_or_dict=test_df, column_id="id", - column_kind="kind", column_sort="sort", - max_timeshift=0, - rolling_direction=1, n_jobs=0) - - self.assertRaises(ValueError, dataframe_functions.roll_time_series, - df_or_dict=test_df, column_id="id", - column_kind="kind", column_sort="sort", - min_timeshift=-1, - rolling_direction=1, n_jobs=0) + self.assertRaises( + ValueError, + dataframe_functions.roll_time_series, + df_or_dict=test_df, + column_id="id", + column_sort=None, + column_kind="kind", + rolling_direction=1, + n_jobs=0, + ) + + self.assertRaises( + ValueError, + dataframe_functions.roll_time_series, + df_or_dict=test_df, + column_id=None, + column_sort=None, + column_kind="kind", + rolling_direction=1, + n_jobs=0, + ) + + self.assertRaises( + ValueError, + dataframe_functions.roll_time_series, + df_or_dict=test_df, + column_id="id", + column_sort=None, + column_kind=None, + rolling_direction=0, + n_jobs=0, + ) + + self.assertRaises( + ValueError, + dataframe_functions.roll_time_series, + df_or_dict=test_df, + column_id=None, + column_sort=None, + column_kind=None, + rolling_direction=0, + n_jobs=0, + ) + + test_df = pd.DataFrame( + {"id": [0, 0], "kind": ["a", "b"], "value": [3, 3], "sort": [1, 1]} + ) + self.assertRaises( + ValueError, + dataframe_functions.roll_time_series, + df_or_dict=test_df, + column_id="id", + column_kind="kind", + column_sort="sort", + max_timeshift=0, + rolling_direction=1, + n_jobs=0, + ) + + self.assertRaises( + ValueError, + dataframe_functions.roll_time_series, + df_or_dict=test_df, + column_id="id", + column_kind="kind", + column_sort="sort", + min_timeshift=-1, + rolling_direction=1, + n_jobs=0, + ) def test_assert_single_row(self): test_df = pd.DataFrame([{"id": np.NaN, "kind": "a", "value": 3, "sort": 1}]) - self.assertRaises(ValueError, dataframe_functions.roll_time_series, - df_or_dict=test_df, column_id="id", - column_sort="sort", column_kind="kind", - rolling_direction=1, n_jobs=0) + self.assertRaises( + ValueError, + dataframe_functions.roll_time_series, + df_or_dict=test_df, + column_id="id", + column_sort="sort", + column_kind="kind", + rolling_direction=1, + n_jobs=0, + ) def test_positive_rolling(self): - first_class = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)}) - second_class = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)}) + first_class = pd.DataFrame( + {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)} + ) + second_class = pd.DataFrame( + {"a": [10, 11], "b": [12, 13], "time": range(20, 22)} + ) first_class["id"] 
= 1 second_class["id"] = 2 @@ -105,29 +180,75 @@ def test_positive_rolling(self): (1, 3), (2, 20), (2, 21), - (2, 21) + (2, 21), + ] + correct_values_a = [ + 1.0, + 1.0, + 2.0, + 1.0, + 2.0, + 3.0, + 1.0, + 2.0, + 3.0, + 4.0, + 10.0, + 10.0, + 11.0, + ] + correct_values_b = [ + 5.0, + 5.0, + 6.0, + 5.0, + 6.0, + 7.0, + 5.0, + 6.0, + 7.0, + 8.0, + 12.0, + 12.0, + 13.0, ] - correct_values_a = [1.0, 1.0, 2.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 10.0, 10.0, 11.0] - correct_values_b = [5.0, 5.0, 6.0, 5.0, 6.0, 7.0, 5.0, 6.0, 7.0, 8.0, 12.0, 12.0, 13.0] - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=1, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=1, + n_jobs=0, + ) self.assertListEqual(list(df["id"]), correct_indices) self.assertListEqual(list(df["a"].values), correct_values_a) self.assertListEqual(list(df["b"].values), correct_values_b) - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=1, - max_timeshift=4, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=1, + max_timeshift=4, + n_jobs=0, + ) self.assertListEqual(list(df["id"]), correct_indices) self.assertListEqual(list(df["a"].values), correct_values_a) self.assertListEqual(list(df["b"].values), correct_values_b) - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=1, - max_timeshift=2, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=1, + max_timeshift=2, + n_jobs=0, + ) correct_indices = [ (1, 0), (1, 1), @@ -140,27 +261,53 @@ def test_positive_rolling(self): (1, 3), (2, 20), (2, 21), - (2, 21) + (2, 21), + ] + correct_values_a = [ + 1.0, + 1.0, + 2.0, + 1.0, + 2.0, + 3.0, + 2.0, + 3.0, + 4.0, + 10.0, + 10.0, + 11.0, + ] + correct_values_b = [ + 5.0, + 5.0, + 6.0, + 5.0, + 6.0, + 7.0, + 6.0, + 7.0, + 8.0, + 12.0, + 12.0, + 13.0, ] - correct_values_a = [1.0, 1.0, 2.0, 1.0, 2.0, 3.0, 2.0, 3.0, 4.0, 10.0, 10.0, 11.0] - correct_values_b = [5.0, 5.0, 6.0, 5.0, 6.0, 7.0, 6.0, 7.0, 8.0, 12.0, 12.0, 13.0] self.assertListEqual(list(df["id"]), correct_indices) self.assertListEqual(list(df["a"].values), correct_values_a) self.assertListEqual(list(df["b"].values), correct_values_b) - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=1, - max_timeshift=2, min_timeshift=2, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=1, + max_timeshift=2, + min_timeshift=2, + n_jobs=0, + ) - correct_indices = [ - (1, 2), - (1, 2), - (1, 2), - (1, 3), - (1, 3), - (1, 3) - ] + correct_indices = [(1, 2), (1, 2), (1, 2), (1, 3), (1, 3), (1, 3)] correct_values_a = [1.0, 2.0, 3.0, 2.0, 3.0, 4.0] correct_values_b = [5.0, 6.0, 7.0, 6.0, 7.0, 8.0] @@ -169,8 +316,12 @@ def test_positive_rolling(self): self.assertListEqual(list(df["b"].values), correct_values_b) def test_negative_rolling(self): - first_class = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)}) - second_class = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)}) + first_class = pd.DataFrame( + 
{"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)} + ) + second_class = pd.DataFrame( + {"a": [10, 11], "b": [12, 13], "time": range(20, 22)} + ) first_class["id"] = 1 second_class["id"] = 2 @@ -199,29 +350,75 @@ def test_negative_rolling(self): (1, 3), (2, 20), (2, 20), - (2, 21) + (2, 21), + ] + correct_values_a = [ + 1.0, + 2.0, + 3.0, + 4.0, + 2.0, + 3.0, + 4.0, + 3.0, + 4.0, + 4.0, + 10.0, + 11.0, + 11.0, + ] + correct_values_b = [ + 5.0, + 6.0, + 7.0, + 8.0, + 6.0, + 7.0, + 8.0, + 7.0, + 8.0, + 8.0, + 12.0, + 13.0, + 13.0, ] - correct_values_a = [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0] - correct_values_b = [5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0] - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=-1, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=-1, + n_jobs=0, + ) self.assertListEqual(list(df["id"].values), correct_indices) self.assertListEqual(list(df["a"].values), correct_values_a) self.assertListEqual(list(df["b"].values), correct_values_b) - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=-1, - max_timeshift=None, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=-1, + max_timeshift=None, + n_jobs=0, + ) self.assertListEqual(list(df["id"].values), correct_indices) self.assertListEqual(list(df["a"].values), correct_values_a) self.assertListEqual(list(df["b"].values), correct_values_b) - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=-1, - max_timeshift=1, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=-1, + max_timeshift=1, + n_jobs=0, + ) correct_indices = [ (1, 0), @@ -242,9 +439,15 @@ def test_negative_rolling(self): self.assertListEqual(list(df["a"].values), correct_values_a) self.assertListEqual(list(df["b"].values), correct_values_b) - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=-1, - max_timeshift=2, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=-1, + max_timeshift=2, + n_jobs=0, + ) correct_indices = [ (1, 0), @@ -258,18 +461,50 @@ def test_negative_rolling(self): (1, 3), (2, 20), (2, 20), - (2, 21) + (2, 21), + ] + correct_values_a = [ + 1.0, + 2.0, + 3.0, + 2.0, + 3.0, + 4.0, + 3.0, + 4.0, + 4.0, + 10.0, + 11.0, + 11.0, + ] + correct_values_b = [ + 5.0, + 6.0, + 7.0, + 6.0, + 7.0, + 8.0, + 7.0, + 8.0, + 8.0, + 12.0, + 13.0, + 13.0, ] - correct_values_a = [1.0, 2.0, 3.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0] - correct_values_b = [5.0, 6.0, 7.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0] self.assertListEqual(list(df["id"].values), correct_indices) self.assertListEqual(list(df["a"].values), correct_values_a) self.assertListEqual(list(df["b"].values), correct_values_b) - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=-1, - max_timeshift=4, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + 
column_sort="time", + column_kind=None, + rolling_direction=-1, + max_timeshift=4, + n_jobs=0, + ) correct_indices = [ (1, 0), @@ -284,28 +519,55 @@ def test_negative_rolling(self): (1, 3), (2, 20), (2, 20), - (2, 21) + (2, 21), + ] + correct_values_a = [ + 1.0, + 2.0, + 3.0, + 4.0, + 2.0, + 3.0, + 4.0, + 3.0, + 4.0, + 4.0, + 10.0, + 11.0, + 11.0, + ] + correct_values_b = [ + 5.0, + 6.0, + 7.0, + 8.0, + 6.0, + 7.0, + 8.0, + 7.0, + 8.0, + 8.0, + 12.0, + 13.0, + 13.0, ] - correct_values_a = [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0] - correct_values_b = [5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0] self.assertListEqual(list(df["id"].values), correct_indices) self.assertListEqual(list(df["a"].values), correct_values_a) self.assertListEqual(list(df["b"].values), correct_values_b) - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=-1, - min_timeshift=2, max_timeshift=3, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=-1, + min_timeshift=2, + max_timeshift=3, + n_jobs=0, + ) - correct_indices = [ - (1, 0), - (1, 0), - (1, 0), - (1, 0), - (1, 1), - (1, 1), - (1, 1) - ] + correct_indices = [(1, 0), (1, 0), (1, 0), (1, 0), (1, 1), (1, 1), (1, 1)] correct_values_a = [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0] correct_values_b = [5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0] @@ -314,8 +576,12 @@ def test_negative_rolling(self): self.assertListEqual(list(df["b"].values), correct_values_b) def test_rolling_with_larger_shift(self): - first_class = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)}) - second_class = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)}) + first_class = pd.DataFrame( + {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)} + ) + second_class = pd.DataFrame( + {"a": [10, 11], "b": [12, 13], "time": range(20, 22)} + ) first_class["id"] = 1 second_class["id"] = 2 @@ -339,13 +605,19 @@ def test_rolling_with_larger_shift(self): (1, 3), (1, 3), (2, 21), - (2, 21) + (2, 21), ] correct_values_a = [1.0, 2.0, 1.0, 2.0, 3.0, 4.0, 10.0, 11.0] correct_values_b = [5.0, 6.0, 5.0, 6.0, 7.0, 8.0, 12.0, 13.0] - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=2, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=2, + n_jobs=0, + ) self.assertListEqual(list(df["id"]), correct_indices) self.assertListEqual(list(df["a"].values), correct_values_a) @@ -359,29 +631,44 @@ def test_rolling_with_larger_shift(self): (1, 2), (1, 2), (2, 20), - (2, 20) + (2, 20), ] correct_values_a = [1.0, 2.0, 3.0, 4.0, 3.0, 4.0, 10.0, 11.0] correct_values_b = [5.0, 6.0, 7.0, 8.0, 7.0, 8.0, 12.0, 13.0] - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=-2, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=-2, + n_jobs=0, + ) self.assertListEqual(list(df["id"]), correct_indices) self.assertListEqual(list(df["a"].values), correct_values_a) self.assertListEqual(list(df["b"].values), correct_values_b) def test_stacked_rolling(self): - first_class = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)}) - second_class = 
pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)}) + first_class = pd.DataFrame( + {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)} + ) + second_class = pd.DataFrame( + {"a": [10, 11], "b": [12, 13], "time": range(20, 22)} + ) first_class["id"] = 1 second_class["id"] = 2 df_full = pd.concat([first_class, second_class], ignore_index=True) - df_stacked = pd.concat([df_full[["time", "id", "a"]].rename(columns={"a": "_value"}), - df_full[["time", "id", "b"]].rename(columns={"b": "_value"})], ignore_index=True) + df_stacked = pd.concat( + [ + df_full[["time", "id", "a"]].rename(columns={"a": "_value"}), + df_full[["time", "id", "b"]].rename(columns={"b": "_value"}), + ], + ignore_index=True, + ) df_stacked["kind"] = ["a"] * 6 + ["b"] * 6 """ df_stacked is @@ -400,31 +687,75 @@ def test_stacked_rolling(self): 11 21 2 13 b """ - df = dataframe_functions.roll_time_series(df_stacked, column_id="id", column_sort="time", - column_kind="kind", rolling_direction=-1, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_stacked, + column_id="id", + column_sort="time", + column_kind="kind", + rolling_direction=-1, + n_jobs=0, + ) correct_indices = ( - [(1, 0)] * 2 * 4 + - [(1, 1)] * 2 * 3 + - [(1, 2)] * 2 * 2 + - [(1, 3)] * 2 * 1 + - [(2, 20)] * 2 * 2 + - [(2, 21)] * 2 * 1 + [(1, 0)] * 2 * 4 + + [(1, 1)] * 2 * 3 + + [(1, 2)] * 2 * 2 + + [(1, 3)] * 2 * 1 + + [(2, 20)] * 2 * 2 + + [(2, 21)] * 2 * 1 ) self.assertListEqual(list(df["id"].values), correct_indices) self.assertListEqual(list(df["kind"].values), ["a", "b"] * 13) - self.assertListEqual(list(df["_value"].values), - [1., 5., 2., 6., 3., 7., 4., 8., 2., 6., 3., 7., 4., 8., 3., 7., 4., 8., 4., 8., 10., 12., - 11., 13., 11., 13.]) + self.assertListEqual( + list(df["_value"].values), + [ + 1.0, + 5.0, + 2.0, + 6.0, + 3.0, + 7.0, + 4.0, + 8.0, + 2.0, + 6.0, + 3.0, + 7.0, + 4.0, + 8.0, + 3.0, + 7.0, + 4.0, + 8.0, + 4.0, + 8.0, + 10.0, + 12.0, + 11.0, + 13.0, + 11.0, + 13.0, + ], + ) def test_dict_rolling(self): df_dict = { - "a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), - "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [1, 1, 1, 1, 2, 2]}) + "a": pd.DataFrame( + {"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]} + ), + "b": pd.DataFrame( + {"_value": [5, 6, 7, 8, 12, 13], "id": [1, 1, 1, 1, 2, 2]} + ), } - df = dataframe_functions.roll_time_series(df_dict, column_id="id", column_sort=None, column_kind=None, - rolling_direction=-1, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_dict, + column_id="id", + column_sort=None, + column_kind=None, + rolling_direction=-1, + n_jobs=0, + ) """ df is {a: _value id 1.0 1 @@ -457,24 +788,39 @@ def test_dict_rolling(self): (1, 3), (2, 0), (2, 0), - (2, 1) + (2, 1), ] self.assertListEqual(list(df["a"]["id"].values), correct_indices) self.assertListEqual(list(df["b"]["id"].values), correct_indices) - self.assertListEqual(list(df["a"]["_value"].values), - [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0]) - self.assertListEqual(list(df["b"]["_value"].values), - [5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0]) + self.assertListEqual( + list(df["a"]["_value"].values), + [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0], + ) + self.assertListEqual( + list(df["b"]["_value"].values), + [5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0], + ) def test_dict_rolling_maxshift_1(self): df_dict = { - "a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 
1, 2, 2]}), - "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [1, 1, 1, 1, 2, 2]}) + "a": pd.DataFrame( + {"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]} + ), + "b": pd.DataFrame( + {"_value": [5, 6, 7, 8, 12, 13], "id": [1, 1, 1, 1, 2, 2]} + ), } - df = dataframe_functions.roll_time_series(df_dict, column_id="id", column_sort=None, column_kind=None, - rolling_direction=-1, max_timeshift=1, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_dict, + column_id="id", + column_sort=None, + column_kind=None, + rolling_direction=-1, + max_timeshift=1, + n_jobs=0, + ) """ df is {a: _value id 1.0 1 @@ -504,14 +850,20 @@ def test_dict_rolling_maxshift_1(self): (1, 3), (2, 0), (2, 0), - (2, 1) + (2, 1), ] self.assertListEqual(list(df["a"]["id"].values), correct_indices) self.assertListEqual(list(df["b"]["id"].values), correct_indices) - self.assertListEqual(list(df["a"]["_value"].values), [1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0]) - self.assertListEqual(list(df["b"]["_value"].values), [5.0, 6.0, 6.0, 7.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0]) + self.assertListEqual( + list(df["a"]["_value"].values), + [1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0], + ) + self.assertListEqual( + list(df["b"]["_value"].values), + [5.0, 6.0, 6.0, 7.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0], + ) def test_order_rolling(self): @@ -527,8 +879,12 @@ def test_order_rolling(self): window_size = 2 df_rolled = dataframe_functions.roll_time_series( - df_full, column_id="initial_id", column_sort="time", - min_timeshift=window_size-1, max_timeshift=window_size-1) + df_full, + column_id="initial_id", + column_sort="time", + min_timeshift=window_size - 1, + max_timeshift=window_size - 1, + ) """ df is {x: _value id @@ -552,33 +908,48 @@ def test_order_rolling(self): (2, 133), (2, 133), (2, 146), - (2, 146) + (2, 146), ] self.assertListEqual(list(df_rolled["id"]), correct_indices) def test_warning_on_non_uniform_time_steps(self): with warnings.catch_warnings(record=True) as w: - first_class = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": [1, 2, 4, 5]}) - second_class = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": list(range(20, 22))}) + first_class = pd.DataFrame( + {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": [1, 2, 4, 5]} + ) + second_class = pd.DataFrame( + {"a": [10, 11], "b": [12, 13], "time": list(range(20, 22))} + ) first_class["id"] = 1 second_class["id"] = 2 df_full = pd.concat([first_class, second_class], ignore_index=True) - dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=1, n_jobs=0) + dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=1, + n_jobs=0, + ) self.assertGreaterEqual(len(w), 1) - self.assertIn("Your time stamps are not uniformly sampled, which makes rolling " - "nonsensical in some domains.", - [str(warning.message) for warning in w] - ) + self.assertIn( + "Your time stamps are not uniformly sampled, which makes rolling " + "nonsensical in some domains.", + [str(warning.message) for warning in w], + ) def test_multicore_rolling(self): - first_class = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)}) - second_class = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)}) + first_class = pd.DataFrame( + {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)} + ) + second_class = pd.DataFrame( + {"a": [10, 11], "b": [12, 13], "time": range(20, 22)} + ) first_class["id"] = 
1 second_class["id"] = 2 @@ -607,20 +978,59 @@ def test_multicore_rolling(self): (1, 3), (2, 20), (2, 21), - (2, 21) + (2, 21), + ] + correct_values_a = [ + 1.0, + 1.0, + 2.0, + 1.0, + 2.0, + 3.0, + 1.0, + 2.0, + 3.0, + 4.0, + 10.0, + 10.0, + 11.0, + ] + correct_values_b = [ + 5.0, + 5.0, + 6.0, + 5.0, + 6.0, + 7.0, + 5.0, + 6.0, + 7.0, + 8.0, + 12.0, + 12.0, + 13.0, ] - correct_values_a = [1.0, 1.0, 2.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 10.0, 10.0, 11.0] - correct_values_b = [5.0, 5.0, 6.0, 5.0, 6.0, 7.0, 5.0, 6.0, 7.0, 8.0, 12.0, 12.0, 13.0] - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=1) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=1, + ) self.assertListEqual(list(df["id"]), correct_indices) self.assertListEqual(list(df["a"].values), correct_values_a) self.assertListEqual(list(df["b"].values), correct_values_b) - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", - column_kind=None, rolling_direction=1, n_jobs=0) + df = dataframe_functions.roll_time_series( + df_full, + column_id="id", + column_sort="time", + column_kind=None, + rolling_direction=1, + n_jobs=0, + ) self.assertListEqual(list(df["id"]), correct_indices) self.assertListEqual(list(df["a"].values), correct_values_a) @@ -636,16 +1046,36 @@ def test_all_columns(self): test_df = pd.DataFrame([[1, 2, 3], [4, np.NaN, 6]], index=[0, 1]) - self.assertRaises(ValueError, dataframe_functions.check_for_nans_in_columns, test_df) + self.assertRaises( + ValueError, dataframe_functions.check_for_nans_in_columns, test_df + ) def test_not_all_columns(self): - test_df = pd.DataFrame([[1, 2, 3], [4, np.NaN, 6]], index=[0, 1], columns=["a", "b", "c"]) + test_df = pd.DataFrame( + [[1, 2, 3], [4, np.NaN, 6]], index=[0, 1], columns=["a", "b", "c"] + ) - self.assertRaises(ValueError, dataframe_functions.check_for_nans_in_columns, test_df) - self.assertRaises(ValueError, dataframe_functions.check_for_nans_in_columns, test_df, ["a", "b"]) - self.assertRaises(ValueError, dataframe_functions.check_for_nans_in_columns, test_df, ["b"]) - self.assertRaises(ValueError, dataframe_functions.check_for_nans_in_columns, test_df, "b") - self.assertRaises(ValueError, dataframe_functions.check_for_nans_in_columns, test_df, ["c", "b"]) + self.assertRaises( + ValueError, dataframe_functions.check_for_nans_in_columns, test_df + ) + self.assertRaises( + ValueError, + dataframe_functions.check_for_nans_in_columns, + test_df, + ["a", "b"], + ) + self.assertRaises( + ValueError, dataframe_functions.check_for_nans_in_columns, test_df, ["b"] + ) + self.assertRaises( + ValueError, dataframe_functions.check_for_nans_in_columns, test_df, "b" + ) + self.assertRaises( + ValueError, + dataframe_functions.check_for_nans_in_columns, + test_df, + ["c", "b"], + ) dataframe_functions.check_for_nans_in_columns(test_df, columns=["a", "c"]) dataframe_functions.check_for_nans_in_columns(test_df, columns="a") @@ -665,16 +1095,22 @@ def test_impute_zero(self): dataframe_functions.impute_dataframe_zero(df) self.assertEqual(list(df.value), [0]) - df = pd.DataFrame([{"value": np.NINF}, {"value": np.NaN}, {"value": np.PINF}, {"value": 1}]) + df = pd.DataFrame( + [{"value": np.NINF}, {"value": np.NaN}, {"value": np.PINF}, {"value": 1}] + ) dataframe_functions.impute_dataframe_zero(df) self.assertEqual(list(df.value), [0, 0, 0, 1]) - df = pd.DataFrame([{"value": np.NINF}, {"value": 
np.NaN}, {"value": np.PINF}, {"value": 1}]) + df = pd.DataFrame( + [{"value": np.NINF}, {"value": np.NaN}, {"value": np.PINF}, {"value": 1}] + ) df = df.astype(np.float64) df = dataframe_functions.impute_dataframe_zero(df) self.assertEqual(list(df.value), [0, 0, 0, 1]) - df = pd.DataFrame([{"value": np.NINF}, {"value": np.NaN}, {"value": np.PINF}, {"value": 1}]) + df = pd.DataFrame( + [{"value": np.NINF}, {"value": np.NaN}, {"value": np.PINF}, {"value": 1}] + ) df = df.astype(np.float32) df = dataframe_functions.impute_dataframe_zero(df) self.assertEqual(list(df.value), [0, 0, 0, 1]) @@ -685,8 +1121,10 @@ def test_impute_zero(self): self.assertEqual(len(df), 0) def test_toplevel_impute(self): - df = pd.DataFrame(np.transpose([[0, 1, 2, np.NaN], [1, np.PINF, 2, 3], [1, -3, np.NINF, 3]]), - columns=["value_a", "value_b", "value_c"]) + df = pd.DataFrame( + np.transpose([[0, 1, 2, np.NaN], [1, np.PINF, 2, 3], [1, -3, np.NINF, 3]]), + columns=["value_a", "value_b", "value_c"], + ) dataframe_functions.impute(df) @@ -694,8 +1132,12 @@ def test_toplevel_impute(self): self.assertEqual(list(df.value_b), [1, 3, 2, 3]) self.assertEqual(list(df.value_c), [1, -3, -3, 3]) - df = pd.DataFrame(np.transpose([[0, 1, 2, np.NaN], [1, np.PINF, 2, np.NaN], [np.NaN, -3, np.NINF, 3]]), - columns=["value_a", "value_b", "value_c"]) + df = pd.DataFrame( + np.transpose( + [[0, 1, 2, np.NaN], [1, np.PINF, 2, np.NaN], [np.NaN, -3, np.NINF, 3]] + ), + columns=["value_a", "value_b", "value_c"], + ) df = df.astype(np.float64) dataframe_functions.impute(df) @@ -703,8 +1145,12 @@ def test_toplevel_impute(self): self.assertEqual(list(df.value_b), [1, 2, 2, 1.5]) self.assertEqual(list(df.value_c), [0, -3, -3, 3]) - df = pd.DataFrame(np.transpose([[0, 1, 2, np.NaN], [1, np.PINF, 2, 3], [np.PINF, -3, np.NINF, 3]]), - columns=["value_a", "value_b", "value_c"]) + df = pd.DataFrame( + np.transpose( + [[0, 1, 2, np.NaN], [1, np.PINF, 2, 3], [np.PINF, -3, np.NINF, 3]] + ), + columns=["value_a", "value_b", "value_c"], + ) df = df.astype(np.float32) dataframe_functions.impute(df) @@ -719,17 +1165,21 @@ def test_toplevel_impute(self): def test_impute_range(self): def get_df(): - return pd.DataFrame(np.transpose([[0, 1, 2, np.NaN], - [1, np.PINF, 2, 3], - [1, -3, np.NINF, 3]]), - columns=["value_a", "value_b", "value_c"]) + return pd.DataFrame( + np.transpose( + [[0, 1, 2, np.NaN], [1, np.PINF, 2, 3], [1, -3, np.NINF, 3]] + ), + columns=["value_a", "value_b", "value_c"], + ) # check if values are replaced correctly df = get_df() col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200} col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134} col_to_median = {"value_a": 55, "value_b": 55, "value_c": 55} - dataframe_functions.impute_dataframe_range(df, col_to_max, col_to_min, col_to_median) + dataframe_functions.impute_dataframe_range( + df, col_to_max, col_to_min, col_to_median + ) self.assertEqual(list(df.value_a), [0, 1, 2, 55]) self.assertEqual(list(df.value_b), [1, 200, 2, 3]) self.assertEqual(list(df.value_c), [1, -3, -134, 3]) @@ -739,35 +1189,61 @@ def get_df(): col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200} col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134} col_to_median = {"value_a": 55, "value_c": 55} - self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range, - df, col_to_max, col_to_min, col_to_median) + self.assertRaises( + ValueError, + dataframe_functions.impute_dataframe_range, + df, + col_to_max, + col_to_min, + col_to_median, + ) # check for no error if column key 
is too much col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200} col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134} col_to_median = {"value_a": 55, "value_b": 55, "value_c": 55, "value_d": 55} - dataframe_functions.impute_dataframe_range(df, col_to_max, col_to_min, col_to_median) + dataframe_functions.impute_dataframe_range( + df, col_to_max, col_to_min, col_to_median + ) # check for error if replacement value is not finite df = get_df() col_to_max = {"value_a": 200, "value_b": np.NaN, "value_c": 200} col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134} col_to_median = {"value_a": 55, "value_b": 55, "value_c": 55} - self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range, - df, col_to_max, col_to_min, col_to_median) + self.assertRaises( + ValueError, + dataframe_functions.impute_dataframe_range, + df, + col_to_max, + col_to_min, + col_to_median, + ) df = get_df() col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200} col_to_min = {"value_a": -134, "value_b": np.NINF, "value_c": -134} col_to_median = {"value_a": 55, "value_b": 55, "value_c": 55} - self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range, - df, col_to_max, col_to_min, col_to_median) + self.assertRaises( + ValueError, + dataframe_functions.impute_dataframe_range, + df, + col_to_max, + col_to_min, + col_to_median, + ) df = get_df() col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200} col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134} col_to_median = {"value_a": 55, "value_b": 55, "value_c": np.PINF} - self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range, - df, col_to_max, col_to_min, col_to_median) + self.assertRaises( + ValueError, + dataframe_functions.impute_dataframe_range, + df, + col_to_max, + col_to_min, + col_to_median, + ) df = pd.DataFrame([0, 1, 2, 3, 4], columns=["test"]) col_dict = {"test": 0} @@ -784,36 +1260,55 @@ def get_df(): class RestrictTestCase(TestCase): def test_restrict_dataframe(self): - df = pd.DataFrame({'id': [1, 2, 3] * 2}) + df = pd.DataFrame({"id": [1, 2, 3] * 2}) - df_restricted = dataframe_functions.restrict_input_to_index(df, 'id', [2]) + df_restricted = dataframe_functions.restrict_input_to_index(df, "id", [2]) self.assertEqual(list(df_restricted.id), [2, 2]) - df_restricted2 = dataframe_functions.restrict_input_to_index(df, 'id', [1, 2, 3]) + df_restricted2 = dataframe_functions.restrict_input_to_index( + df, "id", [1, 2, 3] + ) self.assertTrue(df_restricted2.equals(df)) def test_restrict_dict(self): - kind_to_df = {'a': pd.DataFrame({'id': [1, 2, 3]}), 'b': pd.DataFrame({'id': [3, 4, 5]})} + kind_to_df = { + "a": pd.DataFrame({"id": [1, 2, 3]}), + "b": pd.DataFrame({"id": [3, 4, 5]}), + } - kind_to_df_restricted = dataframe_functions.restrict_input_to_index(kind_to_df, 'id', [3]) - self.assertEqual(list(kind_to_df_restricted['a'].id), [3]) - self.assertEqual(list(kind_to_df_restricted['b'].id), [3]) + kind_to_df_restricted = dataframe_functions.restrict_input_to_index( + kind_to_df, "id", [3] + ) + self.assertEqual(list(kind_to_df_restricted["a"].id), [3]) + self.assertEqual(list(kind_to_df_restricted["b"].id), [3]) - kind_to_df_restricted2 = dataframe_functions.restrict_input_to_index(kind_to_df, 'id', [1, 2, 3, 4, 5]) - self.assertTrue(kind_to_df_restricted2['a'].equals(kind_to_df['a'])) - self.assertTrue(kind_to_df_restricted2['b'].equals(kind_to_df['b'])) + kind_to_df_restricted2 = dataframe_functions.restrict_input_to_index( + kind_to_df, "id", [1, 2, 3, 4, 5] + ) + 
self.assertTrue(kind_to_df_restricted2["a"].equals(kind_to_df["a"])) + self.assertTrue(kind_to_df_restricted2["b"].equals(kind_to_df["b"])) def test_restrict_wrong(self): other_type = np.array([1, 2, 3]) - self.assertRaises(TypeError, dataframe_functions.restrict_input_to_index, other_type, "id", [1, 2, 3]) + self.assertRaises( + TypeError, + dataframe_functions.restrict_input_to_index, + other_type, + "id", + [1, 2, 3], + ) class GetRangeValuesPerColumnTestCase(TestCase): def test_ignores_non_finite_values(self): df = pd.DataFrame([0, 1, 2, 3, np.NaN, np.PINF, np.NINF], columns=["value"]) - col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df) + ( + col_to_max, + col_to_min, + col_to_median, + ) = dataframe_functions.get_range_values_per_column(df) self.assertEqual(col_to_max, {"value": 3}) self.assertEqual(col_to_min, {"value": 0}) @@ -822,7 +1317,11 @@ def test_ignores_non_finite_values(self): def test_range_values_correct_with_even_length(self): df = pd.DataFrame([0, 1, 2, 3], columns=["value"]) - col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df) + ( + col_to_max, + col_to_min, + col_to_median, + ) = dataframe_functions.get_range_values_per_column(df) self.assertEqual(col_to_max, {"value": 3}) self.assertEqual(col_to_min, {"value": 0}) @@ -831,7 +1330,11 @@ def test_range_values_correct_with_even_length(self): def test_range_values_correct_with_uneven_length(self): df = pd.DataFrame([0, 1, 2], columns=["value"]) - col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df) + ( + col_to_max, + col_to_min, + col_to_median, + ) = dataframe_functions.get_range_values_per_column(df) self.assertEqual(col_to_max, {"value": 2}) self.assertEqual(col_to_min, {"value": 0}) @@ -841,11 +1344,17 @@ def test_no_finite_values_yields_0(self): df = pd.DataFrame([np.NaN, np.PINF, np.NINF], columns=["value"]) with warnings.catch_warnings(record=True) as w: - col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df) + ( + col_to_max, + col_to_min, + col_to_median, + ) = dataframe_functions.get_range_values_per_column(df) self.assertEqual(len(w), 1) - self.assertEqual(str(w[0].message), - "The columns ['value'] did not have any finite values. Filling with zeros.") + self.assertEqual( + str(w[0].message), + "The columns ['value'] did not have any finite values. 
Filling with zeros.", + ) self.assertEqual(col_to_max, {"value": 0}) self.assertEqual(col_to_min, {"value": 0}) @@ -853,66 +1362,127 @@ def test_no_finite_values_yields_0(self): class MakeForecastingFrameTestCase(TestCase): - def test_make_forecasting_frame_list(self): - df, y = dataframe_functions.make_forecasting_frame(x=range(4), kind="test", - max_timeshift=1, rolling_direction=1) - expected_df = pd.DataFrame({"id": [("id", 1), ("id", 2), ("id", 3)], - "kind": ["test"] * 3, - "value": [0, 1, 2], - "time": [0, 1, 2]}) - - expected_y = pd.Series(data=[1, 2, 3], index=[("id", 1), ("id", 2), ("id", 3)], name="value") - assert_frame_equal(df.sort_index(axis=1).reset_index(drop=True), expected_df.sort_index(axis=1)) + df, y = dataframe_functions.make_forecasting_frame( + x=range(4), kind="test", max_timeshift=1, rolling_direction=1 + ) + expected_df = pd.DataFrame( + { + "id": [("id", 1), ("id", 2), ("id", 3)], + "kind": ["test"] * 3, + "value": [0, 1, 2], + "time": [0, 1, 2], + } + ) + + expected_y = pd.Series( + data=[1, 2, 3], index=[("id", 1), ("id", 2), ("id", 3)], name="value" + ) + assert_frame_equal( + df.sort_index(axis=1).reset_index(drop=True), expected_df.sort_index(axis=1) + ) assert_series_equal(y, expected_y) def test_make_forecasting_frame_range(self): - df, y = dataframe_functions.make_forecasting_frame(x=np.arange(4), kind="test", - max_timeshift=1, rolling_direction=1) - expected_df = pd.DataFrame({"id": list(zip(["id"] * 3, np.arange(1, 4))), - "kind": ["test"] * 3, - "value": np.arange(3), - "time": [0, 1, 2]}) - expected_y = pd.Series(data=[1, 2, 3], index=[("id", 1), ("id", 2), ("id", 3)], name="value") - assert_frame_equal(df.sort_index(axis=1).reset_index(drop=True), expected_df.sort_index(axis=1)) + df, y = dataframe_functions.make_forecasting_frame( + x=np.arange(4), kind="test", max_timeshift=1, rolling_direction=1 + ) + expected_df = pd.DataFrame( + { + "id": list(zip(["id"] * 3, np.arange(1, 4))), + "kind": ["test"] * 3, + "value": np.arange(3), + "time": [0, 1, 2], + } + ) + expected_y = pd.Series( + data=[1, 2, 3], index=[("id", 1), ("id", 2), ("id", 3)], name="value" + ) + assert_frame_equal( + df.sort_index(axis=1).reset_index(drop=True), expected_df.sort_index(axis=1) + ) assert_series_equal(y, expected_y) def test_make_forecasting_frame_pdSeries(self): - t_index = pd.date_range('1/1/2011', periods=4, freq='H') - df, y = dataframe_functions.make_forecasting_frame(x=pd.Series(data=range(4), index=t_index), - kind="test", max_timeshift=1, rolling_direction=1) - - time_shifts = pd.DatetimeIndex(["2011-01-01 01:00:00", "2011-01-01 02:00:00", "2011-01-01 03:00:00"], freq="H") - expected_y = pd.Series(data=[1, 2, 3], index=zip(["id"]*3, time_shifts), name="value") - expected_df = pd.DataFrame({"id": list(zip(["id"] * 3, pd.DatetimeIndex(["2011-01-01 01:00:00", - "2011-01-01 02:00:00", - "2011-01-01 03:00:00"]))), - "kind": ["test"] * 3, "value": [0, 1, 2], - "time": pd.DatetimeIndex(["2011-01-01 00:00:00", "2011-01-01 01:00:00", - "2011-01-01 02:00:00"]) - }) - assert_frame_equal(df.sort_index(axis=1).reset_index(drop=True), expected_df.sort_index(axis=1)) + t_index = pd.date_range("1/1/2011", periods=4, freq="H") + df, y = dataframe_functions.make_forecasting_frame( + x=pd.Series(data=range(4), index=t_index), + kind="test", + max_timeshift=1, + rolling_direction=1, + ) + + time_shifts = pd.DatetimeIndex( + ["2011-01-01 01:00:00", "2011-01-01 02:00:00", "2011-01-01 03:00:00"], + freq="H", + ) + expected_y = pd.Series( + data=[1, 2, 3], index=zip(["id"] * 3, 
time_shifts), name="value" + ) + expected_df = pd.DataFrame( + { + "id": list( + zip( + ["id"] * 3, + pd.DatetimeIndex( + [ + "2011-01-01 01:00:00", + "2011-01-01 02:00:00", + "2011-01-01 03:00:00", + ] + ), + ) + ), + "kind": ["test"] * 3, + "value": [0, 1, 2], + "time": pd.DatetimeIndex( + [ + "2011-01-01 00:00:00", + "2011-01-01 01:00:00", + "2011-01-01 02:00:00", + ] + ), + } + ) + assert_frame_equal( + df.sort_index(axis=1).reset_index(drop=True), expected_df.sort_index(axis=1) + ) assert_series_equal(y, expected_y) def test_make_forecasting_frame_feature_extraction(self): - t_index = pd.date_range('1/1/2011', periods=4, freq='H') - df, y = dataframe_functions.make_forecasting_frame(x=pd.Series(data=range(4), index=t_index), - kind="test", max_timeshift=1, rolling_direction=1) + t_index = pd.date_range("1/1/2011", periods=4, freq="H") + df, y = dataframe_functions.make_forecasting_frame( + x=pd.Series(data=range(4), index=t_index), + kind="test", + max_timeshift=1, + rolling_direction=1, + ) - extract_relevant_features(df, y, column_id="id", column_sort="time", column_value="value", - default_fc_parameters=MinimalFCParameters()) + extract_relevant_features( + df, + y, + column_id="id", + column_sort="time", + column_value="value", + default_fc_parameters=MinimalFCParameters(), + ) class GetIDsTestCase(TestCase): - def test_get_id__correct_DataFrame(self): df = pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}) self.assertEqual(dataframe_functions.get_ids(df, "id"), {1, 2}) def test_get_id__correct_dict(self): - df_dict = {"a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), - "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})} + df_dict = { + "a": pd.DataFrame( + {"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]} + ), + "b": pd.DataFrame( + {"_value": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]} + ), + } self.assertEqual(dataframe_functions.get_ids(df_dict, "id"), {1, 2, 3, 4}) def test_get_id_wrong(self): @@ -929,60 +1499,77 @@ def test_no_parameters(self): assert_series_equal(dataframe["value"], extended_dataframe["value"]) def test_id_parameters(self): - dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], - "id": [1, 1, 1, 1, 2, 2, 2, 2, 2]}) + dataframe = pd.DataFrame( + {"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], "id": [1, 1, 1, 1, 2, 2, 2, 2, 2]} + ) - extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2, column_id="id") + extended_dataframe = dataframe_functions.add_sub_time_series_index( + dataframe, 2, column_id="id" + ) - self.assertEqual(list(extended_dataframe["id"]), - [(0, 1), (0, 1), (1, 1), (1, 1), (0, 2), (0, 2), (1, 2), (1, 2), (2, 2)]) + self.assertEqual( + list(extended_dataframe["id"]), + [(0, 1), (0, 1), (1, 1), (1, 1), (0, 2), (0, 2), (1, 2), (1, 2), (2, 2)], + ) assert_series_equal(dataframe["value"], extended_dataframe["value"]) def test_kind_parameters(self): - dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], - "id": [1, 1, 1, 1, 2, 2, 2, 2, 2], - "kind": [0, 1, 0, 1, 0, 1, 0, 1, 0]}) + dataframe = pd.DataFrame( + { + "value": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "id": [1, 1, 1, 1, 2, 2, 2, 2, 2], + "kind": [0, 1, 0, 1, 0, 1, 0, 1, 0], + } + ) - extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2, - column_id="id", - column_kind="kind") + extended_dataframe = dataframe_functions.add_sub_time_series_index( + dataframe, 2, column_id="id", column_kind="kind" + ) - 
self.assertEqual(list(extended_dataframe["id"]), - [(0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2), (1, 2)]) + self.assertEqual( + list(extended_dataframe["id"]), + [(0, 1), (0, 1), (0, 1), (0, 1), (0, 2), (0, 2), (0, 2), (0, 2), (1, 2)], + ) assert_series_equal(dataframe["value"], extended_dataframe["value"]) assert_series_equal(dataframe["kind"], extended_dataframe["kind"]) def test_sort_parameters(self): - dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], - "id": [1, 1, 1, 1, 2, 2, 2, 2, 2], - "kind": [0, 1, 0, 1, 0, 1, 0, 1, 0], - "sort": [9, 8, 7, 6, 5, 4, 3, 2, 1]}) - - extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2, - column_id="id", - column_kind="kind", - column_sort="sort") - - self.assertEqual(list(extended_dataframe["id"]), - [(0, 2), (0, 2), (0, 2), (0, 2), (1, 2), (0, 1), (0, 1), (0, 1), (0, 1)]) - self.assertEqual(list(extended_dataframe["value"]), - [9, 8, 7, 6, 5, 4, 3, 2, 1]) - self.assertEqual(list(extended_dataframe["kind"]), - [0, 1, 0, 1, 0, 1, 0, 1, 0]) - self.assertEqual(list(extended_dataframe["sort"]), - [1, 2, 3, 4, 5, 6, 7, 8, 9]) + dataframe = pd.DataFrame( + { + "value": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "id": [1, 1, 1, 1, 2, 2, 2, 2, 2], + "kind": [0, 1, 0, 1, 0, 1, 0, 1, 0], + "sort": [9, 8, 7, 6, 5, 4, 3, 2, 1], + } + ) + + extended_dataframe = dataframe_functions.add_sub_time_series_index( + dataframe, 2, column_id="id", column_kind="kind", column_sort="sort" + ) + + self.assertEqual( + list(extended_dataframe["id"]), + [(0, 2), (0, 2), (0, 2), (0, 2), (1, 2), (0, 1), (0, 1), (0, 1), (0, 1)], + ) + self.assertEqual(list(extended_dataframe["value"]), [9, 8, 7, 6, 5, 4, 3, 2, 1]) + self.assertEqual(list(extended_dataframe["kind"]), [0, 1, 0, 1, 0, 1, 0, 1, 0]) + self.assertEqual(list(extended_dataframe["sort"]), [1, 2, 3, 4, 5, 6, 7, 8, 9]) def test_dict_input(self): - dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], - "id": [1, 1, 1, 1, 2, 2, 2, 2, 2]}) + dataframe = pd.DataFrame( + {"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], "id": [1, 1, 1, 1, 2, 2, 2, 2, 2]} + ) - extended_dataframe = dataframe_functions.add_sub_time_series_index({"1": dataframe}, 2, - column_id="id") + extended_dataframe = dataframe_functions.add_sub_time_series_index( + {"1": dataframe}, 2, column_id="id" + ) self.assertIn("1", extended_dataframe) extended_dataframe = extended_dataframe["1"] - self.assertEqual(list(extended_dataframe["id"]), - [(0, 1), (0, 1), (1, 1), (1, 1), (0, 2), (0, 2), (1, 2), (1, 2), (2, 2)]) + self.assertEqual( + list(extended_dataframe["id"]), + [(0, 1), (0, 1), (1, 1), (1, 1), (0, 2), (0, 2), (1, 2), (1, 2), (2, 2)], + ) assert_series_equal(dataframe["value"], extended_dataframe["value"]) diff --git a/tests/units/utilities/test_distribution.py b/tests/units/utilities/test_distribution.py index 5b1f6c796..3ddaf0518 100644 --- a/tests/units/utilities/test_distribution.py +++ b/tests/units/utilities/test_distribution.py @@ -9,12 +9,15 @@ from distributed import LocalCluster, Client from tsfresh import extract_features -from tsfresh.utilities.distribution import MultiprocessingDistributor, LocalDaskDistributor, ClusterDaskDistributor +from tsfresh.utilities.distribution import ( + MultiprocessingDistributor, + LocalDaskDistributor, + ClusterDaskDistributor, +) from tests.fixtures import DataTestCase class MultiprocessingDistributorTestCase(TestCase): - def test_partition(self): distributor = MultiprocessingDistributor(n_workers=1) @@ -44,23 +47,35 @@ def test__calculate_best_chunk_size(self): 
class LocalDaskDistributorTestCase(DataTestCase): - def test_local_dask_cluster_extraction_one_worker(self): Distributor = LocalDaskDistributor(n_workers=1) df = self.create_test_data_sample() - extracted_features = extract_features(df, column_id="id", column_sort="sort", column_kind="kind", - column_value="val", - distributor=Distributor) + extracted_features = extract_features( + df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + distributor=Distributor, + ) self.assertIsInstance(extracted_features, pd.DataFrame) self.assertTrue(np.all(extracted_features.a__maximum == np.array([71, 77]))) - self.assertTrue(np.all(extracted_features.a__sum_values == np.array([691, 1017]))) - self.assertTrue(np.all(extracted_features.a__abs_energy == np.array([32211, 63167]))) - self.assertTrue(np.all(extracted_features.b__sum_values == np.array([757, 695]))) + self.assertTrue( + np.all(extracted_features.a__sum_values == np.array([691, 1017])) + ) + self.assertTrue( + np.all(extracted_features.a__abs_energy == np.array([32211, 63167])) + ) + self.assertTrue( + np.all(extracted_features.b__sum_values == np.array([757, 695])) + ) self.assertTrue(np.all(extracted_features.b__minimum == np.array([3, 1]))) - self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483]))) + self.assertTrue( + np.all(extracted_features.b__abs_energy == np.array([36619, 35483])) + ) self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75]))) self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0]))) @@ -69,17 +84,30 @@ def test_local_dask_cluster_extraction_two_worker(self): Distributor = LocalDaskDistributor(n_workers=2) df = self.create_test_data_sample() - extracted_features = extract_features(df, column_id="id", column_sort="sort", column_kind="kind", - column_value="val", - distributor=Distributor) + extracted_features = extract_features( + df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + distributor=Distributor, + ) self.assertIsInstance(extracted_features, pd.DataFrame) self.assertTrue(np.all(extracted_features.a__maximum == np.array([71, 77]))) - self.assertTrue(np.all(extracted_features.a__sum_values == np.array([691, 1017]))) - self.assertTrue(np.all(extracted_features.a__abs_energy == np.array([32211, 63167]))) - self.assertTrue(np.all(extracted_features.b__sum_values == np.array([757, 695]))) + self.assertTrue( + np.all(extracted_features.a__sum_values == np.array([691, 1017])) + ) + self.assertTrue( + np.all(extracted_features.a__abs_energy == np.array([32211, 63167])) + ) + self.assertTrue( + np.all(extracted_features.b__sum_values == np.array([757, 695])) + ) self.assertTrue(np.all(extracted_features.b__minimum == np.array([3, 1]))) - self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483]))) + self.assertTrue( + np.all(extracted_features.b__abs_energy == np.array([36619, 35483])) + ) self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75]))) self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0]))) @@ -87,46 +115,76 @@ def test_local_dask_cluster_extraction_two_worker(self): class ClusterDaskDistributorTestCase(DataTestCase): @skipIf(sys.version_info < (3, 6, 0), "dask requires python >= 3.6") def test_dask_cluster_extraction_one_worker(self): - cluster = LocalCluster(n_workers=1, threads_per_worker=1, dashboard_address=None) + cluster = LocalCluster( + n_workers=1, threads_per_worker=1, 
dashboard_address=None + ) client = Client(cluster) - address = client.scheduler_info()['address'] + address = client.scheduler_info()["address"] Distributor = ClusterDaskDistributor(address=address) df = self.create_test_data_sample() - extracted_features = extract_features(df, column_id="id", column_sort="sort", column_kind="kind", - column_value="val", - distributor=Distributor) + extracted_features = extract_features( + df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + distributor=Distributor, + ) self.assertIsInstance(extracted_features, pd.DataFrame) self.assertTrue(np.all(extracted_features.a__maximum == np.array([71, 77]))) - self.assertTrue(np.all(extracted_features.a__sum_values == np.array([691, 1017]))) - self.assertTrue(np.all(extracted_features.a__abs_energy == np.array([32211, 63167]))) - self.assertTrue(np.all(extracted_features.b__sum_values == np.array([757, 695]))) + self.assertTrue( + np.all(extracted_features.a__sum_values == np.array([691, 1017])) + ) + self.assertTrue( + np.all(extracted_features.a__abs_energy == np.array([32211, 63167])) + ) + self.assertTrue( + np.all(extracted_features.b__sum_values == np.array([757, 695])) + ) self.assertTrue(np.all(extracted_features.b__minimum == np.array([3, 1]))) - self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483]))) + self.assertTrue( + np.all(extracted_features.b__abs_energy == np.array([36619, 35483])) + ) self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75]))) self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0]))) cluster.close() @skipIf(sys.version_info < (3, 6, 0), "dask requires python >= 3.6") def test_dask_cluster_extraction_two_workers(self): - cluster = LocalCluster(n_workers=2, threads_per_worker=1, dashboard_address=None) + cluster = LocalCluster( + n_workers=2, threads_per_worker=1, dashboard_address=None + ) client = Client(cluster) - address = client.scheduler_info()['address'] + address = client.scheduler_info()["address"] Distributor = ClusterDaskDistributor(address=address) df = self.create_test_data_sample() - extracted_features = extract_features(df, column_id="id", column_sort="sort", column_kind="kind", - column_value="val", - distributor=Distributor) + extracted_features = extract_features( + df, + column_id="id", + column_sort="sort", + column_kind="kind", + column_value="val", + distributor=Distributor, + ) self.assertIsInstance(extracted_features, pd.DataFrame) self.assertTrue(np.all(extracted_features.a__maximum == np.array([71, 77]))) - self.assertTrue(np.all(extracted_features.a__sum_values == np.array([691, 1017]))) - self.assertTrue(np.all(extracted_features.a__abs_energy == np.array([32211, 63167]))) - self.assertTrue(np.all(extracted_features.b__sum_values == np.array([757, 695]))) + self.assertTrue( + np.all(extracted_features.a__sum_values == np.array([691, 1017])) + ) + self.assertTrue( + np.all(extracted_features.a__abs_energy == np.array([32211, 63167])) + ) + self.assertTrue( + np.all(extracted_features.b__sum_values == np.array([757, 695])) + ) self.assertTrue(np.all(extracted_features.b__minimum == np.array([3, 1]))) - self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483]))) + self.assertTrue( + np.all(extracted_features.b__abs_energy == np.array([36619, 35483])) + ) self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75]))) self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0]))) 
cluster.close() diff --git a/tests/units/utilities/test_string_manipilations.py b/tests/units/utilities/test_string_manipilations.py index 2a0b85665..b5b1ae5c4 100644 --- a/tests/units/utilities/test_string_manipilations.py +++ b/tests/units/utilities/test_string_manipilations.py @@ -9,14 +9,13 @@ class StringUtilities(TestCase): - def test_convert_to_output_format(self): out = convert_to_output_format({"p1": 1, "p2": "a"}) expected_out = 'p1_1__p2_"a"' self.assertEqual(out, expected_out) out = convert_to_output_format({"list": [1, 2, 4]}) - expected_out = 'list_[1, 2, 4]' + expected_out = "list_[1, 2, 4]" self.assertEqual(out, expected_out) out = convert_to_output_format({"list": ["a", "b", "c"]}) @@ -29,5 +28,5 @@ def test_convert_to_output_format_wrong_order(self): self.assertEqual(out, expected_out) out = convert_to_output_format({"c": 1, "b": 2, "a": 3}) - expected_out = 'a_3__b_2__c_1' + expected_out = "a_3__b_2__c_1" self.assertEqual(out, expected_out) diff --git a/tsfresh/__init__.py b/tsfresh/__init__.py index 267266882..38b5b9a89 100644 --- a/tsfresh/__init__.py +++ b/tsfresh/__init__.py @@ -15,7 +15,7 @@ try: __version__ = pkg_resources.get_distribution(__name__).version except Exception: - __version__ = 'unknown' + __version__ = "unknown" # Set default logging handler to avoid "No handler found" warnings. @@ -25,6 +25,8 @@ logging.getLogger(__name__).addHandler(NullHandler()) -from tsfresh.convenience.relevant_extraction import extract_relevant_features # noqa: E402 +from tsfresh.convenience.relevant_extraction import ( + extract_relevant_features, +) # noqa: E402 from tsfresh.feature_extraction import extract_features # noqa: E402 from tsfresh.feature_selection import select_features # noqa: E402 diff --git a/tsfresh/convenience/bindings.py b/tsfresh/convenience/bindings.py index 4ca1132fe..4e28144ba 100644 --- a/tsfresh/convenience/bindings.py +++ b/tsfresh/convenience/bindings.py @@ -6,9 +6,15 @@ import pandas as pd -def _feature_extraction_on_chunk_helper(df, column_id, column_kind, - column_sort, column_value, - default_fc_parameters, kind_to_fc_parameters): +def _feature_extraction_on_chunk_helper( + df, + column_id, + column_kind, + column_sort, + column_value, + default_fc_parameters, + kind_to_fc_parameters, +): """ Helper function wrapped around _do_extraction_on_chunk to use the correct format of the "chunk" and output a pandas dataframe. @@ -25,17 +31,26 @@ def _feature_extraction_on_chunk_helper(df, column_id, column_kind, df = df.sort_values(column_sort) chunk = df[column_id].iloc[0], df[column_kind].iloc[0], df[column_value] - features = _do_extraction_on_chunk(chunk, default_fc_parameters=default_fc_parameters, - kind_to_fc_parameters=kind_to_fc_parameters) + features = _do_extraction_on_chunk( + chunk, + default_fc_parameters=default_fc_parameters, + kind_to_fc_parameters=kind_to_fc_parameters, + ) features = pd.DataFrame(features, columns=[column_id, "variable", "value"]) features["value"] = features["value"].astype("double") return features[[column_id, "variable", "value"]] -def dask_feature_extraction_on_chunk(df, column_id, column_kind, - column_value, column_sort=None, - default_fc_parameters=None, kind_to_fc_parameters=None): +def dask_feature_extraction_on_chunk( + df, + column_id, + column_kind, + column_value, + column_sort=None, + default_fc_parameters=None, + kind_to_fc_parameters=None, +): """ Extract features on a grouped dask dataframe given the column names and the extraction settings. 
This wrapper function should only be used if you have a dask dataframe as input. @@ -115,17 +130,30 @@ def dask_feature_extraction_on_chunk(df, column_id, column_kind, :rtype: dask.dataframe.DataFrame (id int64, variable object, value float64) """ - feature_extraction = partial(_feature_extraction_on_chunk_helper, - column_id=column_id, column_kind=column_kind, - column_sort=column_sort, column_value=column_value, - default_fc_parameters=default_fc_parameters, - kind_to_fc_parameters=kind_to_fc_parameters) - return df.apply(feature_extraction, meta=[(column_id, 'int64'), ('variable', 'object'), ('value', 'float64')]) - - -def spark_feature_extraction_on_chunk(df, column_id, column_kind, - column_value, column_sort=None, - default_fc_parameters=None, kind_to_fc_parameters=None): + feature_extraction = partial( + _feature_extraction_on_chunk_helper, + column_id=column_id, + column_kind=column_kind, + column_sort=column_sort, + column_value=column_value, + default_fc_parameters=default_fc_parameters, + kind_to_fc_parameters=kind_to_fc_parameters, + ) + return df.apply( + feature_extraction, + meta=[(column_id, "int64"), ("variable", "object"), ("value", "float64")], + ) + + +def spark_feature_extraction_on_chunk( + df, + column_id, + column_kind, + column_value, + column_sort=None, + default_fc_parameters=None, + kind_to_fc_parameters=None, +): """ Extract features on a grouped spark dataframe given the column names and the extraction settings. This wrapper function should only be used if you have a spark dataframe as input. @@ -201,14 +229,21 @@ def spark_feature_extraction_on_chunk(df, column_id, column_kind, """ from pyspark.sql.functions import pandas_udf, PandasUDFType - feature_extraction = partial(_feature_extraction_on_chunk_helper, - column_id=column_id, column_kind=column_kind, - column_sort=column_sort, column_value=column_value, - default_fc_parameters=default_fc_parameters, - kind_to_fc_parameters=kind_to_fc_parameters) - - type_string = "{column_id} long, variable string, value double".format(column_id=column_id) - feature_extraction_udf = pandas_udf(type_string, - PandasUDFType.GROUPED_MAP)(feature_extraction) + feature_extraction = partial( + _feature_extraction_on_chunk_helper, + column_id=column_id, + column_kind=column_kind, + column_sort=column_sort, + column_value=column_value, + default_fc_parameters=default_fc_parameters, + kind_to_fc_parameters=kind_to_fc_parameters, + ) + + type_string = "{column_id} long, variable string, value double".format( + column_id=column_id + ) + feature_extraction_udf = pandas_udf(type_string, PandasUDFType.GROUPED_MAP)( + feature_extraction + ) return df.apply(feature_extraction_udf) diff --git a/tsfresh/convenience/relevant_extraction.py b/tsfresh/convenience/relevant_extraction.py index c8d03b7f4..e78105f1f 100644 --- a/tsfresh/convenience/relevant_extraction.py +++ b/tsfresh/convenience/relevant_extraction.py @@ -6,28 +6,39 @@ from tsfresh.feature_extraction import extract_features from tsfresh import defaults from tsfresh.feature_selection import select_features -from tsfresh.utilities.dataframe_functions import restrict_input_to_index, impute, get_ids - - -def extract_relevant_features(timeseries_container, y, X=None, - default_fc_parameters=None, - kind_to_fc_parameters=None, - column_id=None, column_sort=None, column_kind=None, column_value=None, - show_warnings=defaults.SHOW_WARNINGS, - disable_progressbar=defaults.DISABLE_PROGRESSBAR, - profile=defaults.PROFILING, - profiling_filename=defaults.PROFILING_FILENAME, - 
profiling_sorting=defaults.PROFILING_SORTING, - test_for_binary_target_binary_feature=defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE, - test_for_binary_target_real_feature=defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE, - test_for_real_target_binary_feature=defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE, - test_for_real_target_real_feature=defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE, - fdr_level=defaults.FDR_LEVEL, - hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT, - n_jobs=defaults.N_PROCESSES, - distributor=None, - chunksize=defaults.CHUNKSIZE, - ml_task='auto'): +from tsfresh.utilities.dataframe_functions import ( + restrict_input_to_index, + impute, + get_ids, +) + + +def extract_relevant_features( + timeseries_container, + y, + X=None, + default_fc_parameters=None, + kind_to_fc_parameters=None, + column_id=None, + column_sort=None, + column_kind=None, + column_value=None, + show_warnings=defaults.SHOW_WARNINGS, + disable_progressbar=defaults.DISABLE_PROGRESSBAR, + profile=defaults.PROFILING, + profiling_filename=defaults.PROFILING_FILENAME, + profiling_sorting=defaults.PROFILING_SORTING, + test_for_binary_target_binary_feature=defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE, + test_for_binary_target_real_feature=defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE, + test_for_real_target_binary_feature=defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE, + test_for_real_target_real_feature=defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE, + fdr_level=defaults.FDR_LEVEL, + hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT, + n_jobs=defaults.N_PROCESSES, + distributor=None, + chunksize=defaults.CHUNKSIZE, + ml_task="auto", +): """ High level convenience function to extract time series features from `timeseries_container`. Then return feature matrix `X` possibly augmented with relevant features with respect to target vector `y`. @@ -141,46 +152,64 @@ def extract_relevant_features(timeseries_container, y, X=None, :return: Feature matrix X, possibly extended with relevant time series features. 
""" - assert isinstance(y, pd.Series), "y needs to be a pandas.Series, received type: {}.".format(type(y)) - assert len(set(y)) > 1, "Feature selection is only possible if more than 1 label/class is provided" + assert isinstance( + y, pd.Series + ), "y needs to be a pandas.Series, received type: {}.".format(type(y)) + assert ( + len(set(y)) > 1 + ), "Feature selection is only possible if more than 1 label/class is provided" if X is not None: - timeseries_container = restrict_input_to_index(timeseries_container, column_id, X.index) + timeseries_container = restrict_input_to_index( + timeseries_container, column_id, X.index + ) ids_container = get_ids(df_or_dict=timeseries_container, column_id=column_id) ids_y = set(y.index) if ids_container != ids_y: if len(ids_container - ids_y) > 0: - raise ValueError("The following ids are in the time series container but are missing in y: " - "{}".format(ids_container - ids_y)) + raise ValueError( + "The following ids are in the time series container but are missing in y: " + "{}".format(ids_container - ids_y) + ) if len(ids_y - ids_container) > 0: - raise ValueError("The following ids are in y but are missing inside the time series container: " - "{}".format(ids_y - ids_container)) - - X_ext = extract_features(timeseries_container, - default_fc_parameters=default_fc_parameters, - kind_to_fc_parameters=kind_to_fc_parameters, - show_warnings=show_warnings, - disable_progressbar=disable_progressbar, - profile=profile, - profiling_filename=profiling_filename, - profiling_sorting=profiling_sorting, - n_jobs=n_jobs, - column_id=column_id, column_sort=column_sort, - column_kind=column_kind, column_value=column_value, - distributor=distributor, - impute_function=impute) - - X_sel = select_features(X_ext, y, - test_for_binary_target_binary_feature=test_for_binary_target_binary_feature, - test_for_binary_target_real_feature=test_for_binary_target_real_feature, - test_for_real_target_binary_feature=test_for_real_target_binary_feature, - test_for_real_target_real_feature=test_for_real_target_real_feature, - fdr_level=fdr_level, hypotheses_independent=hypotheses_independent, - n_jobs=n_jobs, - show_warnings=show_warnings, - chunksize=chunksize, - ml_task=ml_task) + raise ValueError( + "The following ids are in y but are missing inside the time series container: " + "{}".format(ids_y - ids_container) + ) + + X_ext = extract_features( + timeseries_container, + default_fc_parameters=default_fc_parameters, + kind_to_fc_parameters=kind_to_fc_parameters, + show_warnings=show_warnings, + disable_progressbar=disable_progressbar, + profile=profile, + profiling_filename=profiling_filename, + profiling_sorting=profiling_sorting, + n_jobs=n_jobs, + column_id=column_id, + column_sort=column_sort, + column_kind=column_kind, + column_value=column_value, + distributor=distributor, + impute_function=impute, + ) + + X_sel = select_features( + X_ext, + y, + test_for_binary_target_binary_feature=test_for_binary_target_binary_feature, + test_for_binary_target_real_feature=test_for_binary_target_real_feature, + test_for_real_target_binary_feature=test_for_real_target_binary_feature, + test_for_real_target_real_feature=test_for_real_target_real_feature, + fdr_level=fdr_level, + hypotheses_independent=hypotheses_independent, + n_jobs=n_jobs, + show_warnings=show_warnings, + chunksize=chunksize, + ml_task=ml_task, + ) if X is None: X = X_sel diff --git a/tsfresh/examples/__init__.py b/tsfresh/examples/__init__.py index 11873a241..f7cb9db83 100644 --- a/tsfresh/examples/__init__.py +++ 
b/tsfresh/examples/__init__.py @@ -3,6 +3,9 @@ See for eample the :ref:`quick-start-label` section on how to use them. """ -from .robot_execution_failures import load_robot_execution_failures, download_robot_execution_failures +from .robot_execution_failures import ( + load_robot_execution_failures, + download_robot_execution_failures, +) from .har_dataset import download_har_dataset, load_har_classes, load_har_dataset from .driftbif_simulation import load_driftbif diff --git a/tsfresh/examples/driftbif_simulation.py b/tsfresh/examples/driftbif_simulation.py index a5a61e90a..42a77f95f 100644 --- a/tsfresh/examples/driftbif_simulation.py +++ b/tsfresh/examples/driftbif_simulation.py @@ -163,8 +163,10 @@ def load_driftbif(n, length, m=2, classification=True, kappa_3=0.3, seed=False): # todo: add ratio of classes if m > 2: - logging.warning("You set the dimension parameter for the dissipative soliton to m={}, however it is only" - "properly defined for m=1 or m=2.".format(m)) + logging.warning( + "You set the dimension parameter for the dissipative soliton to m={}, however it is only" + "properly defined for m=1 or m=2.".format(m) + ) id = np.repeat(range(n), length * m) dimensions = list(np.repeat(range(m), length)) * n @@ -183,7 +185,14 @@ def load_driftbif(n, length, m=2, classification=True, kappa_3=0.3, seed=False): values.append(ds.simulate(length, v0=np.zeros(m)).transpose().flatten()) time = np.stack([ds.delta_t * np.arange(length)] * n * m).flatten() - df = pd.DataFrame({'id': id, "time": time, "value": np.stack(values).flatten(), "dimension": dimensions}) + df = pd.DataFrame( + { + "id": id, + "time": time, + "value": np.stack(values).flatten(), + "dimension": dimensions, + } + ) y = pd.Series(labels) y.index = range(n) diff --git a/tsfresh/examples/har_dataset.py b/tsfresh/examples/har_dataset.py index b7a47c8da..6d24a8cf4 100644 --- a/tsfresh/examples/har_dataset.py +++ b/tsfresh/examples/har_dataset.py @@ -29,7 +29,7 @@ _logger = logging.getLogger(__name__) module_path = os.path.dirname(__file__) -data_file_name = os.path.join(module_path, 'data', 'UCI HAR Dataset') +data_file_name = os.path.join(module_path, "data", "UCI HAR Dataset") def download_har_dataset(folder_name=data_file_name): @@ -43,14 +43,16 @@ def download_har_dataset(folder_name=data_file_name): >>> har_dataset.download_har_dataset() """ - zipurl = 'https://github.com/MaxBenChrist/human-activity-dataset/blob/master/UCI%20HAR%20Dataset.zip?raw=True' + zipurl = "https://github.com/MaxBenChrist/human-activity-dataset/blob/master/UCI%20HAR%20Dataset.zip?raw=True" if not os.access(module_path, os.W_OK): - raise RuntimeError("You don't have the necessary permissions to download the Human Activity Dataset " - "Set into the module path. Consider installing the module in a virtualenv you " - "own or run this function with appropriate permissions.") + raise RuntimeError( + "You don't have the necessary permissions to download the Human Activity Dataset " + "Set into the module path. Consider installing the module in a virtualenv you " + "own or run this function with appropriate permissions." 
+ ) - if os.path.exists(os.path.join(folder_name, 'UCI HAR Dataset')): + if os.path.exists(os.path.join(folder_name, "UCI HAR Dataset")): _logger.warning("You have already downloaded the Human Activity Data Set.") return @@ -58,27 +60,42 @@ def download_har_dataset(folder_name=data_file_name): r = requests.get(zipurl, stream=True) if r.status_code != 200: - raise RuntimeError("Could not download the Human Activity Data Set from GitHub." - "HTTP status code: {}".format(r.status_code)) + raise RuntimeError( + "Could not download the Human Activity Data Set from GitHub." + "HTTP status code: {}".format(r.status_code) + ) with ZipFile(BytesIO(r.content)) as zfile: zfile.extractall(path=folder_name) def load_har_dataset(folder_name=data_file_name): - data_file_name_dataset = os.path.join(folder_name, 'UCI HAR Dataset', 'train', 'Inertial Signals', - 'body_acc_x_train.txt') + data_file_name_dataset = os.path.join( + folder_name, + "UCI HAR Dataset", + "train", + "Inertial Signals", + "body_acc_x_train.txt", + ) try: return pd.read_csv(data_file_name_dataset, delim_whitespace=True, header=None) except OSError: - raise OSError('File {} was not found. Have you downloaded the dataset with download_har_dataset() ' - 'before?'.format(data_file_name_dataset)) + raise OSError( + "File {} was not found. Have you downloaded the dataset with download_har_dataset() " + "before?".format(data_file_name_dataset) + ) def load_har_classes(folder_name=data_file_name): - data_file_name_classes = os.path.join(folder_name, 'UCI HAR Dataset', 'train', 'y_train.txt') + data_file_name_classes = os.path.join( + folder_name, "UCI HAR Dataset", "train", "y_train.txt" + ) try: - return pd.read_csv(data_file_name_classes, delim_whitespace=True, header=None, squeeze=True) + return pd.read_csv( + data_file_name_classes, delim_whitespace=True, header=None, squeeze=True + ) except OSError: - raise OSError('File {} was not found. Have you downloaded the dataset with download_har_dataset() ' - 'before?'.format(data_file_name_classes)) + raise OSError( + "File {} was not found. Have you downloaded the dataset with download_har_dataset() " + "before?".format(data_file_name_classes) + ) diff --git a/tsfresh/examples/robot_execution_failures.py b/tsfresh/examples/robot_execution_failures.py index c16f5b60d..ceaeca58b 100644 --- a/tsfresh/examples/robot_execution_failures.py +++ b/tsfresh/examples/robot_execution_failures.py @@ -29,13 +29,15 @@ _logger = logging.getLogger(__name__) -UCI_MLD_REF_MSG = ("The example data could not be found. You need to download the Robot Execution Failures " - "LP1 Data Set from the UCI Machine Learning Repository. To do so, you can call the function " - "tsfresh.examples.robot_execution_failures.download_robot_execution_failures") +UCI_MLD_REF_MSG = ( + "The example data could not be found. You need to download the Robot Execution Failures " + "LP1 Data Set from the UCI Machine Learning Repository. 
To do so, you can call the function " + "tsfresh.examples.robot_execution_failures.download_robot_execution_failures" +) UCI_MLD_REF_URL = "https://raw.githubusercontent.com/MaxBenChrist/robot-failure-dataset/master/lp1.data.txt" module_path = os.path.dirname(__file__) -data_file_name = os.path.join(module_path, 'data', 'robotfailure-mld', 'lp1.data') +data_file_name = os.path.join(module_path, "data", "robotfailure-mld", "lp1.data") def download_robot_execution_failures(file_name=data_file_name): @@ -52,21 +54,27 @@ def download_robot_execution_failures(file_name=data_file_name): >>> download_robot_execution_failures() """ if os.path.exists(file_name): - _logger.warning("You have already downloaded the Robot Execution Failures LP1 Data Set.") + _logger.warning( + "You have already downloaded the Robot Execution Failures LP1 Data Set." + ) return os.makedirs(os.path.dirname(file_name), exist_ok=True) if not os.access(os.path.dirname(file_name), os.W_OK): - raise RuntimeError("You don't have the necessary permissions to download the Robot Execution Failures LP1 Data " - "Set into the module path. Consider installing the module in a virtualenv you " - "own or run this function with appropriate permissions.") + raise RuntimeError( + "You don't have the necessary permissions to download the Robot Execution Failures LP1 Data " + "Set into the module path. Consider installing the module in a virtualenv you " + "own or run this function with appropriate permissions." + ) r = requests.get(UCI_MLD_REF_URL) if r.status_code != 200: - raise RuntimeError("Could not download the Robot Execution Failures LP1 Data Set from the UCI Machine Learning " - "Repository. HTTP status code: {}".format(r.status_code)) + raise RuntimeError( + "Could not download the Robot Execution Failures LP1 Data Set from the UCI Machine Learning " + "Repository. 
HTTP status code: {}".format(r.status_code) + ) with open(file_name, "w") as f: f.write(r.text) @@ -102,20 +110,22 @@ def load_robot_execution_failures(multiclass=False, file_name=data_file_name): for line in f.readlines(): # New sample --> increase id, reset time and determine target - if line[0] not in ['\t', '\n']: + if line[0] not in ["\t", "\n"]: cur_id += 1 time = 0 if multiclass: id_to_target[cur_id] = line.strip() else: - id_to_target[cur_id] = (line.strip() == 'normal') + id_to_target[cur_id] = line.strip() == "normal" # Data row --> split and convert values, create complete df row - elif line[0] == '\t': - values = list(map(int, line.split('\t')[1:])) + elif line[0] == "\t": + values = list(map(int, line.split("\t")[1:])) df_rows.append([cur_id, time] + values) time += 1 - df = pd.DataFrame(df_rows, columns=['id', 'time', 'F_x', 'F_y', 'F_z', 'T_x', 'T_y', 'T_z']) + df = pd.DataFrame( + df_rows, columns=["id", "time", "F_x", "F_y", "F_z", "T_x", "T_y", "T_z"] + ) y = pd.Series(id_to_target) return df, y diff --git a/tsfresh/feature_extraction/__init__.py b/tsfresh/feature_extraction/__init__.py index e5a5be25e..e6c7afea1 100644 --- a/tsfresh/feature_extraction/__init__.py +++ b/tsfresh/feature_extraction/__init__.py @@ -3,5 +3,8 @@ """ from tsfresh.feature_extraction.extraction import extract_features -from tsfresh.feature_extraction.settings import ComprehensiveFCParameters, MinimalFCParameters, \ - EfficientFCParameters +from tsfresh.feature_extraction.settings import ( + ComprehensiveFCParameters, + MinimalFCParameters, + EfficientFCParameters, +) diff --git a/tsfresh/feature_extraction/data.py b/tsfresh/feature_extraction/data.py index 15da29384..4b8b01b9f 100644 --- a/tsfresh/feature_extraction/data.py +++ b/tsfresh/feature_extraction/data.py @@ -3,6 +3,7 @@ from typing import Iterable, Sized import pandas as pd + try: from dask import dataframe as dd except ImportError: # pragma: no cover @@ -14,7 +15,9 @@ def wrapped_feature_extraction(x): if column_sort is not None: x = x.sort_values(column_sort) - chunk = Timeseries(x[column_id].iloc[0], x[column_kind].iloc[0], x[column_value]) + chunk = Timeseries( + x[column_id].iloc[0], x[column_kind].iloc[0], x[column_value] + ) result = f(chunk, **kwargs) result = pd.DataFrame(result, columns=[column_id, "variable", "value"]) @@ -25,7 +28,7 @@ def wrapped_feature_extraction(x): return wrapped_feature_extraction -class Timeseries(namedtuple('Timeseries', ['id', 'kind', 'data'])): +class Timeseries(namedtuple("Timeseries", ["id", "kind", "data"])): """ Timeseries tuple used for feature extraction. @@ -45,6 +48,7 @@ class TsData: Timeseries instances (which distributors can use to apply the function on). Other methods can be overwritten if a more efficient solution exists for the underlying data store. """ + pass @@ -53,6 +57,7 @@ class PartitionedTsData(Iterable[Timeseries], Sized, TsData): Special class of TsData, which can be partitioned. Derived classes should implement __iter__ and __len__. 
""" + def __init__(self, df, column_id): self.df_id_type = df[column_id].dtype @@ -108,10 +113,14 @@ def _check_colname(*columns): for col in columns: if str(col).endswith("_"): - raise ValueError("Dict keys are not allowed to end with '_': {}".format(col)) + raise ValueError( + "Dict keys are not allowed to end with '_': {}".format(col) + ) if "__" in str(col): - raise ValueError("Dict keys are not allowed to contain '__': {}".format(col)) + raise ValueError( + "Dict keys are not allowed to contain '__': {}".format(col) + ) def _check_nan(df, *columns): @@ -140,7 +149,9 @@ def _get_value_columns(df, *other_columns): value_columns = [col for col in df.columns if col not in other_columns] if len(value_columns) == 0: - raise ValueError("Could not guess the value column! Please hand it to the function as an argument.") + raise ValueError( + "Could not guess the value column! Please hand it to the function as an argument." + ) return value_columns @@ -225,11 +236,15 @@ def __init__(self, df, column_id, column_kind, column_value=None, column_sort=No raise ValueError("A value for column_kind needs to be supplied") if column_value is None: - possible_value_columns = _get_value_columns(df, column_id, column_sort, column_kind) + possible_value_columns = _get_value_columns( + df, column_id, column_sort, column_kind + ) if len(possible_value_columns) != 1: - raise ValueError("Could not guess the value column, as the number of unused columns os not equal to 1." - f"These columns where currently unused: {','.join(possible_value_columns)}" - "Please hand it to the function as an argument.") + raise ValueError( + "Could not guess the value column, as the number of unused columns os not equal to 1." + f"These columns where currently unused: {','.join(possible_value_columns)}" + "Please hand it to the function as an argument." + ) self.column_value = possible_value_columns[0] else: self.column_value = column_value @@ -281,10 +296,14 @@ def __init__(self, ts_dict, column_id, column_value, column_sort=None): for key, df in ts_dict.items(): _check_nan(df, column_sort) - self.grouped_dict = {key: df.sort_values([column_sort]).groupby(column_id) - for key, df in ts_dict.items()} + self.grouped_dict = { + key: df.sort_values([column_sort]).groupby(column_id) + for key, df in ts_dict.items() + } else: - self.grouped_dict = {key: df.groupby(column_id) for key, df in ts_dict.items()} + self.grouped_dict = { + key: df.groupby(column_id) for key, df in ts_dict.items() + } super().__init__(df, column_id) @@ -298,7 +317,9 @@ def __len__(self): class DaskTsAdapter(TsData): - def __init__(self, df, column_id, column_kind=None, column_value=None, column_sort=None): + def __init__( + self, df, column_id, column_kind=None, column_value=None, column_sort=None + ): if column_id is None: raise ValueError("column_id must be set") @@ -306,7 +327,9 @@ def __init__(self, df, column_id, column_kind=None, column_value=None, column_so raise ValueError(f"Column not found: {column_id}") # Get all columns, which are not id, kind or sort - possible_value_columns = _get_value_columns(df, column_id, column_sort, column_kind) + possible_value_columns = _get_value_columns( + df, column_id, column_sort, column_kind + ) # The user has already a kind column. That means we just need to group by id (and additionally by id) if column_kind is not None: @@ -318,7 +341,9 @@ def __init__(self, df, column_id, column_kind=None, column_value=None, column_so # We assume the last remaining column is the value - but there needs to be one! 
if column_value is None: if len(possible_value_columns) != 1: - raise ValueError("Could not guess the value column! Please hand it to the function as an argument.") + raise ValueError( + "Could not guess the value column! Please hand it to the function as an argument." + ) column_value = possible_value_columns[0] else: # Ok, the user has no kind, so it is in Wide format. @@ -342,8 +367,12 @@ def __init__(self, df, column_id, column_kind=None, column_value=None, column_so id_vars = [column_id, column_sort] if column_sort else [column_id] # Now melt and group - df_melted = df.melt(id_vars=id_vars, value_vars=value_vars, - var_name=column_kind, value_name=column_value) + df_melted = df.melt( + id_vars=id_vars, + value_vars=value_vars, + var_name=column_kind, + value_name=column_value, + ) self.df = df_melted.groupby([column_id, column_kind]) @@ -361,8 +390,14 @@ def apply(self, f, meta, **kwargs): After the call, turn it back into pandas dataframes for further processing. """ - bound_function = _binding_helper(f, kwargs, self.column_sort, self.column_id, - self.column_kind, self.column_value) + bound_function = _binding_helper( + f, + kwargs, + self.column_sort, + self.column_id, + self.column_kind, + self.column_value, + ) return self.df.apply(bound_function, meta=meta) def pivot(self, results): @@ -376,13 +411,16 @@ def pivot(self, results): """ results = results.reset_index(drop=True).persist() results = results.categorize(columns=["variable"]) - feature_table = results.pivot_table(index=self.column_id, columns="variable", - values="value", aggfunc="sum") + feature_table = results.pivot_table( + index=self.column_id, columns="variable", values="value", aggfunc="sum" + ) return feature_table -def to_tsdata(df, column_id=None, column_kind=None, column_value=None, column_sort=None): +def to_tsdata( + df, column_id=None, column_kind=None, column_value=None, column_sort=None +): """ Wrap supported data formats as a TsData object, i.e. an iterable of individual time series. @@ -426,7 +464,9 @@ def to_tsdata(df, column_id=None, column_kind=None, column_value=None, column_so elif isinstance(df, pd.DataFrame): if column_kind is not None: - return LongTsFrameAdapter(df, column_id, column_kind, column_value, column_sort) + return LongTsFrameAdapter( + df, column_id, column_kind, column_value, column_sort + ) else: if column_value is not None: return WideTsFrameAdapter(df, column_id, column_sort, [column_value]) @@ -440,5 +480,7 @@ def to_tsdata(df, column_id=None, column_kind=None, column_value=None, column_so return DaskTsAdapter(df, column_id, column_kind, column_value, column_sort) else: - raise ValueError("df must be a DataFrame or a dict of DataFrames. " - "See https://tsfresh.readthedocs.io/en/latest/text/data_formats.html") + raise ValueError( + "df must be a DataFrame or a dict of DataFrames. 
" + "See https://tsfresh.readthedocs.io/en/latest/text/data_formats.html" + ) diff --git a/tsfresh/feature_extraction/extraction.py b/tsfresh/feature_extraction/extraction.py index af5194584..9c849bad0 100644 --- a/tsfresh/feature_extraction/extraction.py +++ b/tsfresh/feature_extraction/extraction.py @@ -16,24 +16,36 @@ from tsfresh.feature_extraction.data import to_tsdata from tsfresh.feature_extraction.settings import ComprehensiveFCParameters from tsfresh.utilities import profiling -from tsfresh.utilities.distribution import MapDistributor, MultiprocessingDistributor, \ - DistributorBaseClass, ApplyDistributor +from tsfresh.utilities.distribution import ( + MapDistributor, + MultiprocessingDistributor, + DistributorBaseClass, + ApplyDistributor, +) from tsfresh.utilities.string_manipulation import convert_to_output_format _logger = logging.getLogger(__name__) -def extract_features(timeseries_container, default_fc_parameters=None, - kind_to_fc_parameters=None, - column_id=None, column_sort=None, column_kind=None, column_value=None, - chunksize=defaults.CHUNKSIZE, - n_jobs=defaults.N_PROCESSES, show_warnings=defaults.SHOW_WARNINGS, - disable_progressbar=defaults.DISABLE_PROGRESSBAR, - impute_function=defaults.IMPUTE_FUNCTION, - profile=defaults.PROFILING, - profiling_filename=defaults.PROFILING_FILENAME, - profiling_sorting=defaults.PROFILING_SORTING, - distributor=None, pivot=True): +def extract_features( + timeseries_container, + default_fc_parameters=None, + kind_to_fc_parameters=None, + column_id=None, + column_sort=None, + column_kind=None, + column_value=None, + chunksize=defaults.CHUNKSIZE, + n_jobs=defaults.N_PROCESSES, + show_warnings=defaults.SHOW_WARNINGS, + disable_progressbar=defaults.DISABLE_PROGRESSBAR, + impute_function=defaults.IMPUTE_FUNCTION, + profile=defaults.PROFILING, + profiling_filename=defaults.PROFILING_FILENAME, + profiling_sorting=defaults.PROFILING_SORTING, + distributor=None, + pivot=True, +): """ Extract features from @@ -149,17 +161,21 @@ def extract_features(timeseries_container, default_fc_parameters=None, else: warnings.simplefilter("default") - result = _do_extraction(df=timeseries_container, - column_id=column_id, column_value=column_value, - column_kind=column_kind, - column_sort=column_sort, - n_jobs=n_jobs, chunk_size=chunksize, - disable_progressbar=disable_progressbar, - show_warnings=show_warnings, - default_fc_parameters=default_fc_parameters, - kind_to_fc_parameters=kind_to_fc_parameters, - distributor=distributor, - pivot=pivot) + result = _do_extraction( + df=timeseries_container, + column_id=column_id, + column_value=column_value, + column_kind=column_kind, + column_sort=column_sort, + n_jobs=n_jobs, + chunk_size=chunksize, + disable_progressbar=disable_progressbar, + show_warnings=show_warnings, + default_fc_parameters=default_fc_parameters, + kind_to_fc_parameters=kind_to_fc_parameters, + distributor=distributor, + pivot=pivot, + ) # Impute the result if requested if impute_function is not None: @@ -167,16 +183,28 @@ def extract_features(timeseries_container, default_fc_parameters=None, # Turn off profiling if it was turned on if profile: - profiling.end_profiling(profiler, filename=profiling_filename, - sorting=profiling_sorting) + profiling.end_profiling( + profiler, filename=profiling_filename, sorting=profiling_sorting + ) return result -def _do_extraction(df, column_id, column_value, column_kind, column_sort, - default_fc_parameters, kind_to_fc_parameters, - n_jobs, chunk_size, disable_progressbar, show_warnings, distributor, - 
pivot): +def _do_extraction( + df, + column_id, + column_value, + column_kind, + column_sort, + default_fc_parameters, + kind_to_fc_parameters, + n_jobs, + chunk_size, + disable_progressbar, + show_warnings, + distributor, + pivot, +): """ Wrapper around the _do_extraction_on_chunk, which calls it on all chunks in the data frame. A chunk is a subset of the data, with a given kind and id - so a single time series. @@ -234,27 +262,41 @@ def _do_extraction(df, column_id, column_value, column_kind, column_sort, if distributor is None: if isinstance(data, Iterable): if n_jobs == 0 or n_jobs == 1: - distributor = MapDistributor(disable_progressbar=disable_progressbar, - progressbar_title="Feature Extraction") + distributor = MapDistributor( + disable_progressbar=disable_progressbar, + progressbar_title="Feature Extraction", + ) else: - distributor = MultiprocessingDistributor(n_workers=n_jobs, - disable_progressbar=disable_progressbar, - progressbar_title="Feature Extraction", - show_warnings=show_warnings) + distributor = MultiprocessingDistributor( + n_workers=n_jobs, + disable_progressbar=disable_progressbar, + progressbar_title="Feature Extraction", + show_warnings=show_warnings, + ) else: - distributor = ApplyDistributor(meta=[(data.column_id, 'int64'), ('variable', 'object'), - ('value', 'float64')]) + distributor = ApplyDistributor( + meta=[ + (data.column_id, "int64"), + ("variable", "object"), + ("value", "float64"), + ] + ) if not isinstance(distributor, DistributorBaseClass): raise ValueError("the passed distributor is not an DistributorBaseClass object") - kwargs = dict(default_fc_parameters=default_fc_parameters, - kind_to_fc_parameters=kind_to_fc_parameters, - show_warnings=show_warnings) + kwargs = dict( + default_fc_parameters=default_fc_parameters, + kind_to_fc_parameters=kind_to_fc_parameters, + show_warnings=show_warnings, + ) - result = distributor.map_reduce(_do_extraction_on_chunk, data=data, - chunk_size=chunk_size, - function_kwargs=kwargs) + result = distributor.map_reduce( + _do_extraction_on_chunk, + data=data, + chunk_size=chunk_size, + function_kwargs=kwargs, + ) if not pivot: return result @@ -263,7 +305,9 @@ def _do_extraction(df, column_id, column_value, column_kind, column_sort, return return_df -def _do_extraction_on_chunk(chunk, default_fc_parameters, kind_to_fc_parameters, show_warnings=True): +def _do_extraction_on_chunk( + chunk, default_fc_parameters, kind_to_fc_parameters, show_warnings=True +): """ Main function of this module: use the feature calculators defined in the default_fc_parameters or kind_to_fc_parameters parameters and extract all @@ -300,9 +344,9 @@ def _f(): # If the function uses the index, pass is at as a pandas Series. # Otherwise, convert to numpy array - if getattr(func, 'input', None) == 'pd.Series': + if getattr(func, "input", None) == "pd.Series": # If it has a required index type, check that the data has the right index type. 
- index_type = getattr(func, 'index_type', None) + index_type = getattr(func, "index_type", None) if index_type is not None: try: assert isinstance(data.index, index_type) @@ -320,8 +364,10 @@ def _f(): result = func(x, param=parameter_list) else: if parameter_list: - result = ((convert_to_output_format(param), func(x, **param)) for param in - parameter_list) + result = ( + (convert_to_output_format(param), func(x, **param)) + for param in parameter_list + ) else: result = [("", func(x))] diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py index b314eca0e..f423c0ef9 100644 --- a/tsfresh/feature_extraction/feature_calculators.py +++ b/tsfresh/feature_extraction/feature_calculators.py @@ -145,15 +145,17 @@ def _estimate_friedrich_coefficients(x, m, r): """ assert m > 0, "Order of polynomial need to be positive integer, found {}".format(m) - df = pd.DataFrame({'signal': x[:-1], 'delta': np.diff(x)}) + df = pd.DataFrame({"signal": x[:-1], "delta": np.diff(x)}) try: - df['quantiles'] = pd.qcut(df.signal, r) + df["quantiles"] = pd.qcut(df.signal, r) except ValueError: return [np.NaN] * (m + 1) - quantiles = df.groupby('quantiles') + quantiles = df.groupby("quantiles") - result = pd.DataFrame({'x_mean': quantiles.signal.mean(), 'y_mean': quantiles.delta.mean()}) + result = pd.DataFrame( + {"x_mean": quantiles.signal.mean(), "y_mean": quantiles.delta.mean()} + ) result.dropna(inplace=True) try: @@ -176,7 +178,10 @@ def _aggregate_on_chunks(x, f_agg, chunk_len): :return: A list of the aggregation function over the chunks :return type: list """ - return [getattr(x[i * chunk_len: (i + 1) * chunk_len], f_agg)() for i in range(int(np.ceil(len(x) / chunk_len)))] + return [ + getattr(x[i * chunk_len : (i + 1) * chunk_len], f_agg)() + for i in range(int(np.ceil(len(x) / chunk_len))) + ] def _into_subchunks(x, subchunk_length, every_n=1): @@ -209,11 +214,15 @@ def set_property(key, value): """ This method returns a decorator that sets the property key of the function to value """ + def decorate_func(func): setattr(func, key, value) if func.__doc__ and key == "fctype": - func.__doc__ = func.__doc__ + "\n\n *This function is of type: " + value + "*\n" + func.__doc__ = ( + func.__doc__ + "\n\n *This function is of type: " + value + "*\n" + ) return func + return decorate_func @@ -297,8 +306,10 @@ def symmetry_looking(x, param): x = np.asarray(x) mean_median_difference = np.abs(np.mean(x) - np.median(x)) max_min_difference = np.max(x) - np.min(x) - return [("r_{}".format(r["r"]), mean_median_difference < (r["r"] * max_min_difference)) - for r in param] + return [ + ("r_{}".format(r["r"]), mean_median_difference < (r["r"] * max_min_difference)) + for r in param + ] @set_property("fctype", "simple") @@ -403,12 +414,17 @@ def agg_autocorrelation(x, param): n = len(x) max_maxlag = max([config["maxlag"] for config in param]) - if np.abs(var) < 10**-10 or n == 1: + if np.abs(var) < 10 ** -10 or n == 1: a = [0] * len(x) else: a = acf(x, unbiased=True, fft=n > THRESHOLD_TO_USE_FFT, nlags=max_maxlag)[1:] - return [("f_agg_\"{}\"__maxlag_{}".format(config["f_agg"], config["maxlag"]), - getattr(np, config["f_agg"])(a[:int(config["maxlag"])])) for config in param] + return [ + ( + 'f_agg_"{}"__maxlag_{}'.format(config["f_agg"], config["maxlag"]), + getattr(np, config["f_agg"])(a[: int(config["maxlag"])]), + ) + for config in param + ] @set_property("fctype", "combiner") @@ -1061,9 +1077,15 @@ def fft_coefficient(x, param): :return type: pandas.Series """ - 
assert min([config["coeff"] for config in param]) >= 0, "Coefficients must be positive or zero." - assert {config["attr"] for config in param} <= {"imag", "real", "abs", "angle"}, \ - 'Attribute must be "real", "imag", "angle" or "abs"' + assert ( + min([config["coeff"] for config in param]) >= 0 + ), "Coefficients must be positive or zero." + assert {config["attr"] for config in param} <= { + "imag", + "real", + "abs", + "angle", + }, 'Attribute must be "real", "imag", "angle" or "abs"' fft = np.fft.rfft(x) @@ -1077,9 +1099,16 @@ def complex_agg(x, agg): elif agg == "angle": return np.angle(x, deg=True) - res = [complex_agg(fft[config["coeff"]], config["attr"]) if config["coeff"] < len(fft) - else np.NaN for config in param] - index = ['attr_"{}"__coeff_{}'.format(config["attr"], config["coeff"]) for config in param] + res = [ + complex_agg(fft[config["coeff"]], config["attr"]) + if config["coeff"] < len(fft) + else np.NaN + for config in param + ] + index = [ + 'attr_"{}"__coeff_{}'.format(config["attr"], config["coeff"]) + for config in param + ] return zip(index, res) @@ -1097,8 +1126,12 @@ def fft_aggregated(x, param): :return type: pandas.Series """ - assert {config["aggtype"] for config in param} <= {"centroid", "variance", "skew", "kurtosis"}, \ - 'Attribute must be "centroid", "variance", "skew", "kurtosis"' + assert {config["aggtype"] for config in param} <= { + "centroid", + "variance", + "skew", + "kurtosis", + }, 'Attribute must be "centroid", "variance", "skew", "kurtosis"' def get_moment(y, moment): """ @@ -1112,7 +1145,7 @@ def get_moment(y, moment): :return: the moment requested :return type: float """ - return y.dot(np.arange(len(y), dtype=float)**moment) / y.sum() + return y.dot(np.arange(len(y), dtype=float) ** moment) / y.sum() def get_centroid(y): """ @@ -1150,8 +1183,8 @@ def get_skew(y): return np.nan else: return ( - get_moment(y, 3) - 3 * get_centroid(y) * variance - get_centroid(y)**3 - ) / get_variance(y)**(1.5) + get_moment(y, 3) - 3 * get_centroid(y) * variance - get_centroid(y) ** 3 + ) / get_variance(y) ** (1.5) def get_kurtosis(y): """ @@ -1171,15 +1204,17 @@ def get_kurtosis(y): return np.nan else: return ( - get_moment(y, 4) - 4 * get_centroid(y) * get_moment(y, 3) - + 6 * get_moment(y, 2) * get_centroid(y)**2 - 3 * get_centroid(y) - ) / get_variance(y)**2 + get_moment(y, 4) + - 4 * get_centroid(y) * get_moment(y, 3) + + 6 * get_moment(y, 2) * get_centroid(y) ** 2 + - 3 * get_centroid(y) + ) / get_variance(y) ** 2 calculation = dict( centroid=get_centroid, variance=get_variance, skew=get_skew, - kurtosis=get_kurtosis + kurtosis=get_kurtosis, ) fft_abs = np.abs(np.fft.rfft(x)) @@ -1218,14 +1253,14 @@ def number_peaks(x, n): res = None for i in range(1, n + 1): - result_first = (x_reduced > _roll(x, i)[n:-n]) + result_first = x_reduced > _roll(x, i)[n:-n] if res is None: res = result_first else: res &= result_first - res &= (x_reduced > _roll(x, -i)[n:-n]) + res &= x_reduced > _roll(x, -i)[n:-n] return np.sum(res) @@ -1253,8 +1288,13 @@ def index_mass_quantile(x, param): else: # at least one value is not zero mass_centralized = np.cumsum(abs_x) / s - return [("q_{}".format(config["q"]), - (np.argmax(mass_centralized >= config["q"]) + 1) / len(x)) for config in param] + return [ + ( + "q_{}".format(config["q"]), + (np.argmax(mass_centralized >= config["q"]) + 1) / len(x), + ) + for config in param + ] @set_property("fctype", "simple") @@ -1273,7 +1313,9 @@ def number_cwt_peaks(x, n): :return: the value of this feature :return type: int """ - return 
len(find_peaks_cwt(vector=x, widths=np.array(list(range(1, n + 1))), wavelet=ricker)) + return len( + find_peaks_cwt(vector=x, widths=np.array(list(range(1, n + 1))), wavelet=ricker) + ) @set_property("fctype", "combiner") @@ -1297,8 +1339,10 @@ def linear_trend(x, param): # todo: we could use the index of the DataFrame here linReg = linregress(range(len(x)), x) - return [("attr_\"{}\"".format(config["attr"]), getattr(linReg, config["attr"])) - for config in param] + return [ + ('attr_"{}"'.format(config["attr"]), getattr(linReg, config["attr"])) + for config in param + ] @set_property("fctype", "combiner") @@ -1369,15 +1413,21 @@ def spkt_welch_density(x, param): coeff = [config["coeff"] for config in param] indices = ["coeff_{}".format(i) for i in coeff] - if len(pxx) <= np.max(coeff): # There are fewer data points in the time series than requested coefficients + if len(pxx) <= np.max( + coeff + ): # There are fewer data points in the time series than requested coefficients # filter coefficients that are not contained in pxx reduced_coeff = [coefficient for coefficient in coeff if len(pxx) > coefficient] - not_calculated_coefficients = [coefficient for coefficient in coeff - if coefficient not in reduced_coeff] + not_calculated_coefficients = [ + coefficient for coefficient in coeff if coefficient not in reduced_coeff + ] # Fill up the rest of the requested coefficients with np.NaNs - return zip(indices, list(pxx[reduced_coeff]) + [np.NaN] * len(not_calculated_coefficients)) + return zip( + indices, + list(pxx[reduced_coeff]) + [np.NaN] * len(not_calculated_coefficients), + ) else: return zip(indices, pxx[coeff]) @@ -1418,7 +1468,9 @@ def ar_coefficient(x, param): if k not in calculated_ar_params: try: calculated_AR = AR(x_as_list) - calculated_ar_params[k] = calculated_AR.fit(maxlag=k, solver="mle").params + calculated_ar_params[k] = calculated_AR.fit( + maxlag=k, solver="mle" + ).params except (LinAlgError, ValueError): calculated_ar_params[k] = [np.NaN] * k @@ -1521,7 +1573,9 @@ def time_reversal_asymmetry_statistic(x, lag): else: one_lag = _roll(x, -lag) two_lag = _roll(x, 2 * -lag) - return np.mean((two_lag * two_lag * one_lag - one_lag * x * x)[0:(n - 2 * lag)]) + return np.mean( + (two_lag * two_lag * one_lag - one_lag * x * x)[0 : (n - 2 * lag)] + ) @set_property("fctype", "simple") @@ -1563,7 +1617,7 @@ def c3(x, lag): if 2 * lag >= n: return 0 else: - return np.mean((_roll(x, 2 * -lag) * _roll(x, -lag) * x)[0:(n - 2 * lag)]) + return np.mean((_roll(x, 2 * -lag) * _roll(x, -lag) * x)[0 : (n - 2 * lag)]) @set_property("fctype", "simple") @@ -1580,7 +1634,9 @@ def mean_n_absolute_max(x, number_of_maxima): :return type: float """ - assert number_of_maxima > 0, f" number_of_maxima={number_of_maxima} which is not greater than 1" + assert ( + number_of_maxima > 0 + ), f" number_of_maxima={number_of_maxima} which is not greater than 1" n_absolute_maximum_values = np.sort(np.absolute(x))[-number_of_maxima:] @@ -1616,7 +1672,7 @@ def binned_entropy(x, max_bins): hist, bin_edges = np.histogram(x, bins=max_bins) probs = hist / x.size probs[probs == 0] = 1.0 - return - np.sum(probs * np.log(probs)) + return -np.sum(probs * np.log(probs)) # todo - include latex formula @@ -1645,7 +1701,9 @@ def sample_entropy(x): return np.nan m = 2 # common value for m, according to wikipedia... - tolerance = 0.2 * np.std(x) # 0.2 is a common value for r, according to wikipedia... + tolerance = 0.2 * np.std( + x + ) # 0.2 is a common value for r, according to wikipedia... 
# Split time series and save all templates of length m # Basically we turn [1, 2, 3, 4] into [1, 2], [2, 3], [3, 4] @@ -1669,7 +1727,9 @@ def sample_entropy(x): # Similar for computing A xmp1 = _into_subchunks(x, m + 1) - A = np.sum([np.sum(np.abs(xmi - xmp1).max(axis=1) <= tolerance) - 1 for xmi in xmp1]) + A = np.sum( + [np.sum(np.abs(xmi - xmp1).max(axis=1) <= tolerance) - 1 for xmi in xmp1] + ) # Return SampEn return -np.log(A / B) @@ -1716,9 +1776,14 @@ def approximate_entropy(x, m, r): return 0 def _phi(m): - x_re = np.array([x[i:i + m] for i in range(N - m + 1)]) - C = np.sum(np.max(np.abs(x_re[:, np.newaxis] - x_re[np.newaxis, :]), - axis=2) <= r, axis=0) / (N - m + 1) + x_re = np.array([x[i : i + m] for i in range(N - m + 1)]) + C = ( + np.sum( + np.max(np.abs(x_re[:, np.newaxis] - x_re[np.newaxis, :]), axis=2) <= r, + axis=0, + ) + / (N - m + 1) + ) return np.sum(np.log(C)) / (N - m + 1.0) return np.abs(_phi(m) - _phi(m + 1)) @@ -1760,7 +1825,7 @@ def lempel_ziv_complexity(x, bins): x = np.asarray(x) bins = np.linspace(np.min(x), np.max(x), bins + 1)[1:] - sequence = np.searchsorted(bins, x, side='left') + sequence = np.searchsorted(bins, x, side="left") sub_strings = set() n = len(sequence) @@ -1769,7 +1834,7 @@ def lempel_ziv_complexity(x, bins): inc = 1 while ind + inc <= n: # convert to tuple in order to make it hashable - sub_str = tuple(sequence[ind:ind + inc]) + sub_str = tuple(sequence[ind : ind + inc]) if sub_str in sub_strings: inc += 1 else: @@ -1862,7 +1927,7 @@ def autocorrelation(x, lag): if len(x) < lag: return np.nan # Slice the relevant subseries based on the lag - y1 = x[:(len(x) - lag)] + y1 = x[: (len(x) - lag)] y2 = x[lag:] # Subtract the mean of the whole series x x_mean = np.mean(x) @@ -2029,11 +2094,13 @@ def friedrich_coefficients(x, param): res = {} for parameter_combination in param: - m = parameter_combination['m'] - r = parameter_combination['r'] + m = parameter_combination["m"] + r = parameter_combination["r"] coeff = parameter_combination["coeff"] - assert coeff >= 0, "Coefficients must be positive or zero. Found {}".format(coeff) + assert coeff >= 0, "Coefficients must be positive or zero. 
Found {}".format( + coeff + ) # calculate the current friedrich coefficients if they do not exist yet if m not in calculated or r not in calculated[m]: @@ -2121,7 +2188,9 @@ def agg_linear_trend(x, param): calculated_agg[f_agg][chunk_len] = np.NaN else: aggregate_result = _aggregate_on_chunks(x, f_agg, chunk_len) - lin_reg_result = linregress(range(len(aggregate_result)), aggregate_result) + lin_reg_result = linregress( + range(len(aggregate_result)), aggregate_result + ) calculated_agg[f_agg][chunk_len] = lin_reg_result attr = parameter_combination["attr"] @@ -2131,7 +2200,9 @@ def agg_linear_trend(x, param): else: res_data.append(getattr(calculated_agg[f_agg][chunk_len], attr)) - res_index.append("attr_\"{}\"__chunk_len_{}__f_agg_\"{}\"".format(attr, chunk_len, f_agg)) + res_index.append( + 'attr_"{}"__chunk_len_{}__f_agg_"{}"'.format(attr, chunk_len, f_agg) + ) return zip(res_index, res_data) @@ -2171,9 +2242,14 @@ def energy_ratio_by_chunks(x, param): if full_series_energy == 0: res_data.append(np.NaN) else: - res_data.append(np.sum(np.array_split(x, num_segments)[segment_focus] ** 2.0) / full_series_energy) + res_data.append( + np.sum(np.array_split(x, num_segments)[segment_focus] ** 2.0) + / full_series_energy + ) - res_index.append("num_segments_{}__segment_focus_{}".format(num_segments, segment_focus)) + res_index.append( + "num_segments_{}__segment_focus_{}".format(num_segments, segment_focus) + ) # Materialize as list for Python 3 compatibility with name handling return list(zip(res_index, res_data)) @@ -2209,8 +2285,10 @@ def linear_trend_timewise(x, param): linReg = linregress(times_hours, x.values) - return [("attr_\"{}\"".format(config["attr"]), getattr(linReg, config["attr"])) - for config in param] + return [ + ('attr_"{}"'.format(config["attr"]), getattr(linReg, config["attr"])) + for config in param + ] @set_property("fctype", "simple") @@ -2226,7 +2304,7 @@ def count_above(x, t): :return: the value of this feature :return type: float """ - return np.sum(x >= t)/len(x) + return np.sum(x >= t) / len(x) @set_property("fctype", "simple") @@ -2242,42 +2320,44 @@ def count_below(x, t): :return: the value of this feature :return type: float """ - return np.sum(x <= t)/len(x) + return np.sum(x <= t) / len(x) @set_property("fctype", "simple") def benford_correlation(x): """ - Useful for anomaly detection applications [1][2]. Returns the correlation from first digit distribution when - compared to the Newcomb-Benford's Law distribution [3][4]. + Useful for anomaly detection applications [1][2]. Returns the correlation from first digit distribution when + compared to the Newcomb-Benford's Law distribution [3][4]. - .. math:: + .. math:: - P(d)=\\log_{10}\\left(1+\\frac{1}{d}\\right) + P(d)=\\log_{10}\\left(1+\\frac{1}{d}\\right) - where :math:`P(d)` is the Newcomb-Benford distribution for :math:`d` that is the leading digit of the number - {1, 2, 3, 4, 5, 6, 7, 8, 9}. + where :math:`P(d)` is the Newcomb-Benford distribution for :math:`d` that is the leading digit of the number + {1, 2, 3, 4, 5, 6, 7, 8, 9}. - .. rubric:: References + .. rubric:: References - | [1] A Statistical Derivation of the Significant-Digit Law, Theodore P. Hill, Statistical Science, 1995 - | [2] The significant-digit phenomenon, Theodore P. 
Hill, The American Mathematical Monthly, 1995 - | [3] The law of anomalous numbers, Frank Benford, Proceedings of the American philosophical society, 1938 - | [4] Note on the frequency of use of the different digits in natural numbers, Simon Newcomb, American Journal of - | mathematics, 1881 - - :param x: the time series to calculate the feature of - :type x: numpy.ndarray - :return: the value of this feature - :return type: float - """ + | [1] A Statistical Derivation of the Significant-Digit Law, Theodore P. Hill, Statistical Science, 1995 + | [2] The significant-digit phenomenon, Theodore P. Hill, The American Mathematical Monthly, 1995 + | [3] The law of anomalous numbers, Frank Benford, Proceedings of the American philosophical society, 1938 + | [4] Note on the frequency of use of the different digits in natural numbers, Simon Newcomb, American Journal of + | mathematics, 1881 + + :param x: the time series to calculate the feature of + :type x: numpy.ndarray + :return: the value of this feature + :return type: float + """ x = np.asarray(x) # retrieve first digit from data - x = np.array([int(str(np.format_float_scientific(i))[:1]) for i in np.abs(np.nan_to_num(x))]) + x = np.array( + [int(str(np.format_float_scientific(i))[:1]) for i in np.abs(np.nan_to_num(x))] + ) # benford distribution - benford_distribution = np.array([np.log10(1 + 1/n) for n in range(1, 10)]) + benford_distribution = np.array([np.log10(1 + 1 / n) for n in range(1, 10)]) data_distribution = np.array([(x == n).mean() for n in range(1, 10)]) @@ -2314,10 +2394,12 @@ def _calculate_mp(**kwargs): """Calculate the matrix profile using the specified window, or the max subsequence if no window is specified""" try: if "windows" in kwargs: - m_p = mp.compute(x, **kwargs)['mp'] + m_p = mp.compute(x, **kwargs)["mp"] else: - m_p = mp.algorithms.maximum_subsequence(x, include_pmp=True, **kwargs)['pmp'][-1] + m_p = mp.algorithms.maximum_subsequence(x, include_pmp=True, **kwargs)[ + "pmp" + ][-1] return m_p @@ -2333,7 +2415,7 @@ def _calculate_mp(**kwargs): for kwargs in param: kwargs = kwargs.copy() key = convert_to_output_format(kwargs) - feature = kwargs.pop('feature') + feature = kwargs.pop("feature") featureless_key = convert_to_output_format(kwargs) if featureless_key not in matrix_profiles: @@ -2402,8 +2484,8 @@ def query_similarity_count(x, param): for i, kwargs in enumerate(param): key = convert_to_output_format(kwargs) normalize = kwargs.get("normalize", True) - threshold = kwargs.get('threshold', 0.0) - Q = kwargs.get('query', None) + threshold = kwargs.get("threshold", 0.0) + Q = kwargs.get("query", None) Q = np.asarray(Q).astype(float) count = np.nan if Q is not None and Q.size >= 3: diff --git a/tsfresh/feature_extraction/settings.py b/tsfresh/feature_extraction/settings.py index fcdcca285..fad2efbb1 100644 --- a/tsfresh/feature_extraction/settings.py +++ b/tsfresh/feature_extraction/settings.py @@ -52,11 +52,13 @@ def from_columns(columns, columns_to_ignore=None): raise TypeError("Column name {} should be a string or unicode".format(col)) # Split according to our separator into , , - parts = col.split('__') + parts = col.split("__") n_parts = len(parts) if n_parts == 1: - raise ValueError("Splitting of columnname {} resulted in only one part.".format(col)) + raise ValueError( + "Splitting of columnname {} resulted in only one part.".format(col) + ) kind = parts[0] feature_name = parts[1] @@ -89,6 +91,7 @@ class PickeableSettings(UserDict): cloudpickle is able to pickle much more functions than pickle can and pickle 
will only see the already encoded keys (not the raw functions). """ + def __getstate__(self): """Called on pickling. Encode the keys by cloudpickling them""" state = {cloudpickle.dumps(key): value for key, value in self.items()} @@ -127,60 +130,129 @@ def __init__(self): name_to_param = {} for name, func in feature_calculators.__dict__.items(): - if callable(func) and hasattr(func, "fctype") and len(getfullargspec(func).args) == 1: + if ( + callable(func) + and hasattr(func, "fctype") + and len(getfullargspec(func).args) == 1 + ): name_to_param[name] = None - name_to_param.update({ - "time_reversal_asymmetry_statistic": [{"lag": lag} for lag in range(1, 4)], - "c3": [{"lag": lag} for lag in range(1, 4)], - "cid_ce": [{"normalize": True}, {"normalize": False}], - "symmetry_looking": [{"r": r * 0.05} for r in range(20)], - "large_standard_deviation": [{"r": r * 0.05} for r in range(1, 20)], - "quantile": [{"q": q} for q in [.1, .2, .3, .4, .6, .7, .8, .9]], - "autocorrelation": [{"lag": lag} for lag in range(10)], - "agg_autocorrelation": [{"f_agg": s, "maxlag": 40} for s in ["mean", "median", "var"]], - "partial_autocorrelation": [{"lag": lag} for lag in range(10)], - "number_cwt_peaks": [{"n": n} for n in [1, 5]], - "number_peaks": [{"n": n} for n in [1, 3, 5, 10, 50]], - "binned_entropy": [{"max_bins": max_bins} for max_bins in [10]], - "index_mass_quantile": [{"q": q} for q in [.1, .2, .3, .4, .6, .7, .8, .9]], - "cwt_coefficients": [{"widths": width, "coeff": coeff, "w": w} for - width in [(2, 5, 10, 20)] for coeff in range(15) for w in (2, 5, 10, 20)], - "spkt_welch_density": [{"coeff": coeff} for coeff in [2, 5, 8]], - "ar_coefficient": [{"coeff": coeff, "k": k} for coeff in range(10 + 1) for k in [10]], - "change_quantiles": [{"ql": ql, "qh": qh, "isabs": b, "f_agg": f} - for ql in [0., .2, .4, .6, .8] for qh in [.2, .4, .6, .8, 1.] 
- for b in [False, True] for f in ["mean", "var"] if ql < qh], - "fft_coefficient": [{"coeff": k, "attr": a} for a, k in - product(["real", "imag", "abs", "angle"], range(100))], - "fft_aggregated": [{"aggtype": s} for s in ["centroid", "variance", "skew", "kurtosis"]], - "value_count": [{"value": value} for value in [0, 1, -1]], - "range_count": [{"min": -1, "max": 1}, {"min": -1e12, "max": 0}, - {"min": 0, "max": 1e12}], - "approximate_entropy": [{"m": 2, "r": r} for r in [.1, .3, .5, .7, .9]], - "friedrich_coefficients": (lambda m: [{"coeff": coeff, "m": m, "r": 30} for coeff in range(m + 1)])(3), - "max_langevin_fixed_point": [{"m": 3, "r": 30}], - "linear_trend": [{"attr": "pvalue"}, {"attr": "rvalue"}, {"attr": "intercept"}, - {"attr": "slope"}, {"attr": "stderr"}], - "agg_linear_trend": [{"attr": attr, "chunk_len": i, "f_agg": f} - for attr in ["rvalue", "intercept", "slope", "stderr"] - for i in [5, 10, 50] - for f in ["max", "min", "mean", "var"]], - "augmented_dickey_fuller": [{"attr": "teststat"}, {"attr": "pvalue"}, {"attr": "usedlag"}], - "number_crossing_m": [{"m": 0}, {"m": -1}, {"m": 1}], - "energy_ratio_by_chunks": [{"num_segments": 10, "segment_focus": i} for i in range(10)], - "ratio_beyond_r_sigma": [{"r": x} for x in [0.5, 1, 1.5, 2, 2.5, 3, 5, 6, 7, 10]], - "linear_trend_timewise": [{"attr": "pvalue"}, {"attr": "rvalue"}, {"attr": "intercept"}, - {"attr": "slope"}, {"attr": "stderr"}], - "count_above": [{"t": 0}], - "count_below": [{"t": 0}], - "lempel_ziv_complexity": [{"bins": x} for x in [2, 3, 5, 10, 100]], - "fourier_entropy": [{"bins": x} for x in [2, 3, 5, 10, 100]], - "permutation_entropy": [{"tau": 1, "dimension": x} for x in [3, 4, 5, 6, 7]], - "query_similarity_count": [{"query": None, "threshold": 0.0}], - "matrix_profile": [{"threshold": 0.98, "feature": f} for f in ["min", "max", "mean", "median", "25", "75"]], - "mean_n_absolute_max": [{"number_of_maxima": 3, "number_of_maxima": 5, "number_of_maxima": 7}] - }) + name_to_param.update( + { + "time_reversal_asymmetry_statistic": [ + {"lag": lag} for lag in range(1, 4) + ], + "c3": [{"lag": lag} for lag in range(1, 4)], + "cid_ce": [{"normalize": True}, {"normalize": False}], + "symmetry_looking": [{"r": r * 0.05} for r in range(20)], + "large_standard_deviation": [{"r": r * 0.05} for r in range(1, 20)], + "quantile": [ + {"q": q} for q in [0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9] + ], + "autocorrelation": [{"lag": lag} for lag in range(10)], + "agg_autocorrelation": [ + {"f_agg": s, "maxlag": 40} for s in ["mean", "median", "var"] + ], + "partial_autocorrelation": [{"lag": lag} for lag in range(10)], + "number_cwt_peaks": [{"n": n} for n in [1, 5]], + "number_peaks": [{"n": n} for n in [1, 3, 5, 10, 50]], + "binned_entropy": [{"max_bins": max_bins} for max_bins in [10]], + "index_mass_quantile": [ + {"q": q} for q in [0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9] + ], + "cwt_coefficients": [ + {"widths": width, "coeff": coeff, "w": w} + for width in [(2, 5, 10, 20)] + for coeff in range(15) + for w in (2, 5, 10, 20) + ], + "spkt_welch_density": [{"coeff": coeff} for coeff in [2, 5, 8]], + "ar_coefficient": [ + {"coeff": coeff, "k": k} for coeff in range(10 + 1) for k in [10] + ], + "change_quantiles": [ + {"ql": ql, "qh": qh, "isabs": b, "f_agg": f} + for ql in [0.0, 0.2, 0.4, 0.6, 0.8] + for qh in [0.2, 0.4, 0.6, 0.8, 1.0] + for b in [False, True] + for f in ["mean", "var"] + if ql < qh + ], + "fft_coefficient": [ + {"coeff": k, "attr": a} + for a, k in product(["real", "imag", "abs", "angle"], range(100)) + ], 
+ "fft_aggregated": [ + {"aggtype": s} for s in ["centroid", "variance", "skew", "kurtosis"] + ], + "value_count": [{"value": value} for value in [0, 1, -1]], + "range_count": [ + {"min": -1, "max": 1}, + {"min": -1e12, "max": 0}, + {"min": 0, "max": 1e12}, + ], + "approximate_entropy": [ + {"m": 2, "r": r} for r in [0.1, 0.3, 0.5, 0.7, 0.9] + ], + "friedrich_coefficients": ( + lambda m: [ + {"coeff": coeff, "m": m, "r": 30} for coeff in range(m + 1) + ] + )(3), + "max_langevin_fixed_point": [{"m": 3, "r": 30}], + "linear_trend": [ + {"attr": "pvalue"}, + {"attr": "rvalue"}, + {"attr": "intercept"}, + {"attr": "slope"}, + {"attr": "stderr"}, + ], + "agg_linear_trend": [ + {"attr": attr, "chunk_len": i, "f_agg": f} + for attr in ["rvalue", "intercept", "slope", "stderr"] + for i in [5, 10, 50] + for f in ["max", "min", "mean", "var"] + ], + "augmented_dickey_fuller": [ + {"attr": "teststat"}, + {"attr": "pvalue"}, + {"attr": "usedlag"}, + ], + "number_crossing_m": [{"m": 0}, {"m": -1}, {"m": 1}], + "energy_ratio_by_chunks": [ + {"num_segments": 10, "segment_focus": i} for i in range(10) + ], + "ratio_beyond_r_sigma": [ + {"r": x} for x in [0.5, 1, 1.5, 2, 2.5, 3, 5, 6, 7, 10] + ], + "linear_trend_timewise": [ + {"attr": "pvalue"}, + {"attr": "rvalue"}, + {"attr": "intercept"}, + {"attr": "slope"}, + {"attr": "stderr"}, + ], + "count_above": [{"t": 0}], + "count_below": [{"t": 0}], + "lempel_ziv_complexity": [{"bins": x} for x in [2, 3, 5, 10, 100]], + "fourier_entropy": [{"bins": x} for x in [2, 3, 5, 10, 100]], + "permutation_entropy": [ + {"tau": 1, "dimension": x} for x in [3, 4, 5, 6, 7] + ], + "query_similarity_count": [{"query": None, "threshold": 0.0}], + "matrix_profile": [ + {"threshold": 0.98, "feature": f} + for f in ["min", "max", "mean", "median", "25", "75"] + ], + "mean_n_absolute_max": [ + { + "number_of_maxima": 3, + "number_of_maxima": 5, + "number_of_maxima": 7, + } + ], + } + ) super().__init__(name_to_param) @@ -205,7 +277,9 @@ def __init__(self): ComprehensiveFCParameters.__init__(self) for fname, f in feature_calculators.__dict__.items(): - if fname in self and (not hasattr(f, "minimal") or not getattr(f, "minimal")): + if fname in self and ( + not hasattr(f, "minimal") or not getattr(f, "minimal") + ): del self[fname] diff --git a/tsfresh/feature_selection/significance_tests.py b/tsfresh/feature_selection/significance_tests.py index f3acd3c1c..21f930c38 100644 --- a/tsfresh/feature_selection/significance_tests.py +++ b/tsfresh/feature_selection/significance_tests.py @@ -72,8 +72,7 @@ def target_binary_feature_binary_test(x, y): n_y1_x1 = np.sum(y[x == x1] == y1) n_y0_x1 = len(y[x == x1]) - n_y1_x1 - table = np.array([[n_y1_x1, n_y1_x0], - [n_y0_x1, n_y0_x0]]) + table = np.array([[n_y1_x1, n_y1_x0], [n_y0_x1, n_y0_x0]]) # Perform the Fisher test oddsratio, p_value = stats.fisher_exact(table, alternative="two-sided") @@ -115,17 +114,21 @@ def target_binary_feature_real_test(x, y, test): x_y1 = x[y == y1] x_y0 = x[y == y0] - if test == 'mann': + if test == "mann": # Perform Mann-Whitney-U test - U, p_mannwhitu = stats.mannwhitneyu(x_y1, x_y0, use_continuity=True, alternative='two-sided') + U, p_mannwhitu = stats.mannwhitneyu( + x_y1, x_y0, use_continuity=True, alternative="two-sided" + ) return p_mannwhitu - elif test == 'smir': + elif test == "smir": # Perform Kolmogorov-Smirnov test KS, p_ks = stats.ks_2samp(x_y1, x_y0) return p_ks else: - raise ValueError("Please use a valid entry for test_for_binary_target_real_feature. 
" + - "Valid entries are 'mann' and 'smir'.") + raise ValueError( + "Please use a valid entry for test_for_binary_target_real_feature. " + + "Valid entries are 'mann' and 'smir'." + ) def target_real_feature_binary_test(x, y): @@ -224,9 +227,12 @@ def __check_for_binary_target(y): if len(set(y)) > 2: raise ValueError("Target is not binary!") - warnings.warn("The binary target should have " - "values 1 and 0 (or True and False). " - "Instead found" + str(set(y)), RuntimeWarning) + warnings.warn( + "The binary target should have " + "values 1 and 0 (or True and False). " + "Instead found" + str(set(y)), + RuntimeWarning, + ) def __check_for_binary_feature(x): @@ -244,11 +250,16 @@ def __check_for_binary_feature(x): """ if not set(x) == {0, 1}: if len(set(x)) > 2: - raise ValueError("[target_binary_feature_binary_test] Feature is not binary!") + raise ValueError( + "[target_binary_feature_binary_test] Feature is not binary!" + ) - warnings.warn("A binary feature should have only " - "values 1 and 0 (incl. True and False). " - "Instead found " + str(set(x)) + " in feature ''" + str(x.name) + "''.", RuntimeWarning) + warnings.warn( + "A binary feature should have only " + "values 1 and 0 (incl. True and False). " + "Instead found " + str(set(x)) + " in feature ''" + str(x.name) + "''.", + RuntimeWarning, + ) def _check_for_nans(x, y): @@ -261,6 +272,6 @@ def _check_for_nans(x, y): :raises: `ValueError` if target or feature contains NaNs. """ if np.isnan(x.values).any(): - raise ValueError('Feature {} contains NaN values'.format(x.name)) + raise ValueError("Feature {} contains NaN values".format(x.name)) elif np.isnan(y.values).any(): - raise ValueError('Target contains NaN values') + raise ValueError("Target contains NaN values") diff --git a/tsfresh/scripts/measure_execution_time.py b/tsfresh/scripts/measure_execution_time.py index 72948a660..795fa6631 100644 --- a/tsfresh/scripts/measure_execution_time.py +++ b/tsfresh/scripts/measure_execution_time.py @@ -5,7 +5,11 @@ # Do these calculations in a controlled environment # (e.g. a cloud provider VM) # You will need to have b2luigi installed. 
-from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, extract_features +from tsfresh.feature_extraction import ( + ComprehensiveFCParameters, + MinimalFCParameters, + extract_features, +) import pandas as pd import numpy as np @@ -16,6 +20,7 @@ class DataCreationTask(luigi.Task): """Create random data for testing""" + num_ids = luigi.IntParameter(default=100) time_series_length = luigi.IntParameter() random_seed = luigi.IntParameter() @@ -26,14 +31,18 @@ def output(self): def run(self): np.random.seed(self.random_seed) - df = pd.concat([ - pd.DataFrame({ - "id": [i] * self.time_series_length, - "time": range(self.time_series_length), - "value": np.random.randn(self.time_series_length) - }) - for i in range(self.num_ids) - ]) + df = pd.concat( + [ + pd.DataFrame( + { + "id": [i] * self.time_series_length, + "time": range(self.time_series_length), + "value": np.random.randn(self.time_series_length), + } + ) + for i in range(self.num_ids) + ] + ) with self._get_output_target("data.csv").open("w") as f: df.to_csv(f) @@ -42,6 +51,7 @@ def run(self): @luigi.requires(DataCreationTask) class TimingTask(luigi.Task): """Run tsfresh with the given parameters""" + feature_parameter = luigi.DictParameter(hashed=True) n_jobs = luigi.IntParameter() try_number = luigi.IntParameter() @@ -56,9 +66,14 @@ def run(self): df = pd.read_csv(f) start_time = time() - extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs, - default_fc_parameters=self.feature_parameter, - disable_progressbar=True) + extract_features( + df, + column_id="id", + column_sort="time", + n_jobs=self.n_jobs, + default_fc_parameters=self.feature_parameter, + disable_progressbar=True, + ) end_time = time() single_parameter_name = list(self.feature_parameter.keys())[0] @@ -69,7 +84,9 @@ def run(self): "n_ids": self.num_ids, "n_jobs": self.n_jobs, "feature": single_parameter_name, - "number_parameters": len(single_parameter_params) if single_parameter_params else 0, + "number_parameters": len(single_parameter_params) + if single_parameter_params + else 0, "time_series_length": int((df["id"] == 0).sum()), "try_number": self.try_number, } @@ -81,6 +98,7 @@ def run(self): @luigi.requires(DataCreationTask) class FullTimingTask(luigi.Task): """Run tsfresh with all calculators for comparison""" + n_jobs = luigi.IntParameter() def output(self): @@ -93,8 +111,13 @@ def run(self): df = pd.read_csv(f) start_time = time() - extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs, - disable_progressbar=True) + extract_features( + df, + column_id="id", + column_sort="time", + n_jobs=self.n_jobs, + disable_progressbar=True, + ) end_time = time() result_json = { @@ -110,6 +133,7 @@ def run(self): class CombinerTask(luigi.Task): """Collect all tasks into a single result.csv file""" + def complete(self): return False @@ -117,14 +141,18 @@ def requires(self): settings = ComprehensiveFCParameters() for job in [0, 1, 4]: for time_series_length in [100, 500, 1000, 5000]: - yield FullTimingTask(time_series_length=time_series_length, - n_jobs=job, - num_ids=10, - random_seed=42) - yield FullTimingTask(time_series_length=time_series_length, - n_jobs=job, - num_ids=100, - random_seed=42) + yield FullTimingTask( + time_series_length=time_series_length, + n_jobs=job, + num_ids=10, + random_seed=42, + ) + yield FullTimingTask( + time_series_length=time_series_length, + n_jobs=job, + num_ids=100, + random_seed=42, + ) for feature_name in settings: yield TimingTask( @@ -133,7 +161,7 @@ def 
requires(self): n_jobs=job, num_ids=100, try_number=0, - random_seed=42 + random_seed=42, ) for try_number in range(3): @@ -143,7 +171,7 @@ def requires(self): try_number=try_number, num_ids=10, time_series_length=time_series_length, - random_seed=42 + random_seed=42, ) def output(self): diff --git a/tsfresh/scripts/run_tsfresh.py b/tsfresh/scripts/run_tsfresh.py index 3705cc175..5fdc321ee 100644 --- a/tsfresh/scripts/run_tsfresh.py +++ b/tsfresh/scripts/run_tsfresh.py @@ -43,40 +43,63 @@ def _preprocess(df): def main(console_args=None): - parser = argparse.ArgumentParser(description="Extract features from time series stored in a CSV file and " - "write them back into another CSV file. The time series in the CSV " - "file should either have one of the dataframe-formats described in " - "http://tsfresh.readthedocs.io/en/latest/text/data_formats.html, " - "which means you have to supply the --csv-with-headers flag " - "or should be in the form " - "[time series 1 values ..., time series 2 values ...] " - "where you should not add the --csv-with-headers flag. " - "The CSV is expected to be space-separated.") - parser.add_argument("input_file_name", help="File name of the input CSV file to read in.") - parser.add_argument("--output-file-name", help="File name of the output CSV file to write to. " - "Defaults to input_file_name.features.csv", - default=None) - - parser.add_argument("--column-sort", help="Column name to be used to sort the rows. " - "Only available when --csv-with-headers is enabled.", - default=None) - parser.add_argument("--column-kind", help="Column name where the kind column can be found." - "Only available when --csv-with-headers is enabled.", - default=None) - parser.add_argument("--column-value", help="Column name where the values can be found." - "Only available when --csv-with-headers is enabled.", - default=None) - parser.add_argument("--column-id", help="Column name where the ids can be found." - "Only available when --csv-with-headers is enabled.", - default=None) - - parser.add_argument('--csv-with-headers', action='store_true', help="") + parser = argparse.ArgumentParser( + description="Extract features from time series stored in a CSV file and " + "write them back into another CSV file. The time series in the CSV " + "file should either have one of the dataframe-formats described in " + "http://tsfresh.readthedocs.io/en/latest/text/data_formats.html, " + "which means you have to supply the --csv-with-headers flag " + "or should be in the form " + "[time series 1 values ..., time series 2 values ...] " + "where you should not add the --csv-with-headers flag. " + "The CSV is expected to be space-separated." + ) + parser.add_argument( + "input_file_name", help="File name of the input CSV file to read in." + ) + parser.add_argument( + "--output-file-name", + help="File name of the output CSV file to write to. " + "Defaults to input_file_name.features.csv", + default=None, + ) + + parser.add_argument( + "--column-sort", + help="Column name to be used to sort the rows. " + "Only available when --csv-with-headers is enabled.", + default=None, + ) + parser.add_argument( + "--column-kind", + help="Column name where the kind column can be found." + "Only available when --csv-with-headers is enabled.", + default=None, + ) + parser.add_argument( + "--column-value", + help="Column name where the values can be found." 
+ "Only available when --csv-with-headers is enabled.", + default=None, + ) + parser.add_argument( + "--column-id", + help="Column name where the ids can be found." + "Only available when --csv-with-headers is enabled.", + default=None, + ) + + parser.add_argument("--csv-with-headers", action="store_true", help="") print(console_args) args = parser.parse_args(console_args) - if (args.column_id or args.column_kind or args.column_sort or args.column_value) and (not args.csv_with_headers): - raise AttributeError("You can only pass in column-value, column-kind, column-id or column-sort if " - "--csv-with-headers is enabled.") + if ( + args.column_id or args.column_kind or args.column_sort or args.column_value + ) and (not args.csv_with_headers): + raise AttributeError( + "You can only pass in column-value, column-kind, column-id or column-sort if " + "--csv-with-headers is enabled." + ) if args.csv_with_headers: column_kind = args.column_kind @@ -98,18 +121,22 @@ def main(console_args=None): if not args.csv_with_headers: df = _preprocess(df) - df_features = extract_features(df, column_kind=column_kind, - column_sort=column_sort, column_value=column_value, - column_id=column_id) + df_features = extract_features( + df, + column_kind=column_kind, + column_sort=column_sort, + column_value=column_value, + column_id=column_id, + ) # re-cast index from float to int - df_features.index = df_features.index.astype('int') + df_features.index = df_features.index.astype("int") # write to disk - default_out_file_name = os.path.splitext(input_file_name)[0] + '.features.csv' + default_out_file_name = os.path.splitext(input_file_name)[0] + ".features.csv" output_file_name = args.output_file_name or default_out_file_name df_features.to_csv(output_file_name) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tsfresh/scripts/test_timing.py b/tsfresh/scripts/test_timing.py index 5db8e0e22..b15f38a97 100644 --- a/tsfresh/scripts/test_timing.py +++ b/tsfresh/scripts/test_timing.py @@ -11,7 +11,7 @@ def test_with_length(length, df): from tsfresh import extract_features start = time.time() - df = extract_features(df[:length], column_id='id', column_sort='time') + df = extract_features(df[:length], column_id="id", column_sort="time") end = time.time() duration = end - start @@ -19,7 +19,15 @@ def test_with_length(length, df): print("Some checks with length", length) print(100 * duration) print(len(df.columns), len(df)) - print(df[["a__abs_energy", "b__absolute_sum_of_changes", "f__time_reversal_asymmetry_statistic__lag_1"]].head()) + print( + df[ + [ + "a__abs_energy", + "b__absolute_sum_of_changes", + "f__time_reversal_asymmetry_statistic__lag_1", + ] + ].head() + ) return {"length": length, "duration": duration} @@ -29,7 +37,11 @@ def plot_results(): plt.figure(figsize=(7, 7)) - baseline = pd.read_csv("a57a09fe62a62fe0d2564a056f7fd99f58822312.dat").groupby("length").duration.mean() + baseline = ( + pd.read_csv("a57a09fe62a62fe0d2564a056f7fd99f58822312.dat") + .groupby("length") + .duration.mean() + ) for file_name in glob("*.dat"): df = pd.read_csv(file_name).groupby("length").duration.mean() @@ -55,12 +67,20 @@ def plot_results(): def test_timing(): - from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, \ - load_robot_execution_failures + from tsfresh.examples.robot_execution_failures import ( + download_robot_execution_failures, + load_robot_execution_failures, + ) + download_robot_execution_failures() df, y = load_robot_execution_failures() - commit_hash 
= check_output(["git", "log", "--format=\"%H\"", "-1"]).decode("ascii").strip().replace("\"", "") + commit_hash = ( + check_output(["git", "log", '--format="%H"', "-1"]) + .decode("ascii") + .strip() + .replace('"', "") + ) lengths_to_test = [1, 5, 10, 60, 100, 400, 600, 1000, 2000] results = [] diff --git a/tsfresh/transformers/feature_augmenter.py b/tsfresh/transformers/feature_augmenter.py index f8da619ef..48b45bfc4 100644 --- a/tsfresh/transformers/feature_augmenter.py +++ b/tsfresh/transformers/feature_augmenter.py @@ -60,17 +60,24 @@ class FeatureAugmenter(BaseEstimator, TransformerMixin): :mod:`~tsfresh.feature_extraction.extraction`. """ - def __init__(self, default_fc_parameters=None, - kind_to_fc_parameters=None, column_id=None, column_sort=None, - column_kind=None, column_value=None, timeseries_container=None, - chunksize=tsfresh.defaults.CHUNKSIZE, - n_jobs=tsfresh.defaults.N_PROCESSES, show_warnings=tsfresh.defaults.SHOW_WARNINGS, - disable_progressbar=tsfresh.defaults.DISABLE_PROGRESSBAR, - impute_function=tsfresh.defaults.IMPUTE_FUNCTION, - profile=tsfresh.defaults.PROFILING, - profiling_filename=tsfresh.defaults.PROFILING_FILENAME, - profiling_sorting=tsfresh.defaults.PROFILING_SORTING - ): + def __init__( + self, + default_fc_parameters=None, + kind_to_fc_parameters=None, + column_id=None, + column_sort=None, + column_kind=None, + column_value=None, + timeseries_container=None, + chunksize=tsfresh.defaults.CHUNKSIZE, + n_jobs=tsfresh.defaults.N_PROCESSES, + show_warnings=tsfresh.defaults.SHOW_WARNINGS, + disable_progressbar=tsfresh.defaults.DISABLE_PROGRESSBAR, + impute_function=tsfresh.defaults.IMPUTE_FUNCTION, + profile=tsfresh.defaults.PROFILING, + profiling_filename=tsfresh.defaults.PROFILING_FILENAME, + profiling_sorting=tsfresh.defaults.PROFILING_SORTING, + ): """ Create a new FeatureAugmenter instance. :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names @@ -192,24 +199,35 @@ def transform(self, X): :rtype: pandas.DataFrame """ if self.timeseries_container is None: - raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.") + raise RuntimeError( + "You have to provide a time series using the set_timeseries_container function before." 
+ ) # Extract only features for the IDs in X.index - timeseries_container_X = restrict_input_to_index(self.timeseries_container, self.column_id, X.index) - - extracted_features = extract_features(timeseries_container_X, - default_fc_parameters=self.default_fc_parameters, - kind_to_fc_parameters=self.kind_to_fc_parameters, - column_id=self.column_id, column_sort=self.column_sort, - column_kind=self.column_kind, column_value=self.column_value, - chunksize=self.chunksize, - n_jobs=self.n_jobs, show_warnings=self.show_warnings, - disable_progressbar=self.disable_progressbar, - impute_function=self.impute_function, - profile=self.profile, - profiling_filename=self.profiling_filename, - profiling_sorting=self.profiling_sorting) - - X = pd.merge(X, extracted_features, left_index=True, right_index=True, how="left") + timeseries_container_X = restrict_input_to_index( + self.timeseries_container, self.column_id, X.index + ) + + extracted_features = extract_features( + timeseries_container_X, + default_fc_parameters=self.default_fc_parameters, + kind_to_fc_parameters=self.kind_to_fc_parameters, + column_id=self.column_id, + column_sort=self.column_sort, + column_kind=self.column_kind, + column_value=self.column_value, + chunksize=self.chunksize, + n_jobs=self.n_jobs, + show_warnings=self.show_warnings, + disable_progressbar=self.disable_progressbar, + impute_function=self.impute_function, + profile=self.profile, + profiling_filename=self.profiling_filename, + profiling_sorting=self.profiling_sorting, + ) + + X = pd.merge( + X, extracted_features, left_index=True, right_index=True, how="left" + ) return X diff --git a/tsfresh/transformers/per_column_imputer.py b/tsfresh/transformers/per_column_imputer.py index 7a201867c..5e063e801 100644 --- a/tsfresh/transformers/per_column_imputer.py +++ b/tsfresh/transformers/per_column_imputer.py @@ -4,7 +4,10 @@ from sklearn.base import BaseEstimator, TransformerMixin from sklearn.exceptions import NotFittedError -from tsfresh.utilities.dataframe_functions import get_range_values_per_column, impute_dataframe_range +from tsfresh.utilities.dataframe_functions import ( + get_range_values_per_column, + impute_dataframe_range, +) import pandas as pd @@ -26,7 +29,12 @@ class PerColumnImputer(BaseEstimator, TransformerMixin): the column-wise computed min, max and median values. """ - def __init__(self, col_to_NINF_repl_preset=None, col_to_PINF_repl_preset=None, col_to_NAN_repl_preset=None): + def __init__( + self, + col_to_NINF_repl_preset=None, + col_to_PINF_repl_preset=None, + col_to_NAN_repl_preset=None, + ): """ Create a new PerColumnImputer instance, optionally with dictionaries containing replacements for ``NaNs`` and ``infs``. 
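# --- Illustrative sketch, not part of the patch ---
# A minimal, hand-written example of the PerColumnImputer API whose reformatting
# is shown in this file: fit() learns per-column min/max/median, transform()
# then replaces -inf, +inf and NaN with those values. The tiny frame below is
# made up purely for the demo.
import numpy as np
import pandas as pd

from tsfresh.transformers.per_column_imputer import PerColumnImputer

X_train = pd.DataFrame(
    {
        "a": [1.0, 2.0, np.nan, 4.0],      # NaN -> column median
        "b": [np.inf, 1.0, 2.0, -np.inf],  # +inf -> column max, -inf -> column min
    }
)

imputer = PerColumnImputer()
X_clean = imputer.fit_transform(X_train)
print(X_clean)

# The *_repl_preset dictionaries override the learned values for selected
# columns only, e.g. PerColumnImputer(col_to_NAN_repl_preset={"a": 0.0});
# calling transform() before fit() raises NotFittedError, as the transform()
# hunk further down enforces.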
@@ -65,22 +73,28 @@ def fit(self, X, y=None): if self.col_to_NINF_repl_preset is not None: if not set(X.columns) >= set(self.col_to_NINF_repl_preset.keys()): - raise ValueError("Preset dictionary 'col_to_NINF_repl_preset' contain more keys " - "than the column names in X") + raise ValueError( + "Preset dictionary 'col_to_NINF_repl_preset' contain more keys " + "than the column names in X" + ) col_to_min.update(self.col_to_NINF_repl_preset) self._col_to_NINF_repl = col_to_min if self.col_to_PINF_repl_preset is not None: if not set(X.columns) >= set(self.col_to_PINF_repl_preset.keys()): - raise ValueError("Preset dictionary 'col_to_PINF_repl_preset' contain more keys " - "than the column names in X") + raise ValueError( + "Preset dictionary 'col_to_PINF_repl_preset' contain more keys " + "than the column names in X" + ) col_to_max.update(self.col_to_PINF_repl_preset) self._col_to_PINF_repl = col_to_max if self.col_to_NAN_repl_preset is not None: if not set(X.columns) >= set(self.col_to_NAN_repl_preset.keys()): - raise ValueError("Preset dictionary 'col_to_NAN_repl_preset' contain more keys " - "than the column names in X") + raise ValueError( + "Preset dictionary 'col_to_NAN_repl_preset' contain more keys " + "than the column names in X" + ) col_to_median.update(self.col_to_NAN_repl_preset) self._col_to_NAN_repl = col_to_median @@ -103,9 +117,15 @@ def transform(self, X): if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) - if self._col_to_NINF_repl is None or self._col_to_PINF_repl is None or self._col_to_NAN_repl is None: + if ( + self._col_to_NINF_repl is None + or self._col_to_PINF_repl is None + or self._col_to_NAN_repl is None + ): raise NotFittedError("PerColumnImputer is not fitted") - X = impute_dataframe_range(X, self._col_to_PINF_repl, self._col_to_NINF_repl, self._col_to_NAN_repl) + X = impute_dataframe_range( + X, self._col_to_PINF_repl, self._col_to_NINF_repl, self._col_to_NAN_repl + ) return X diff --git a/tsfresh/transformers/relevant_feature_augmenter.py b/tsfresh/transformers/relevant_feature_augmenter.py index 7eb1c69ed..0e0c00277 100644 --- a/tsfresh/transformers/relevant_feature_augmenter.py +++ b/tsfresh/transformers/relevant_feature_augmenter.py @@ -9,7 +9,10 @@ from tsfresh.feature_extraction.settings import from_columns from tsfresh.transformers.feature_augmenter import FeatureAugmenter from tsfresh.transformers.feature_selector import FeatureSelector -from tsfresh.utilities.dataframe_functions import impute_dataframe_range, get_range_values_per_column +from tsfresh.utilities.dataframe_functions import ( + impute_dataframe_range, + get_range_values_per_column, +) # Pro: It offers more control @@ -90,7 +93,10 @@ def __init__( filter_only_tsfresh_features=True, default_fc_parameters=None, kind_to_fc_parameters=None, - column_id=None, column_sort=None, column_kind=None, column_value=None, + column_id=None, + column_sort=None, + column_kind=None, + column_value=None, timeseries_container=None, chunksize=defaults.CHUNKSIZE, n_jobs=defaults.N_PROCESSES, @@ -226,7 +232,9 @@ def __init__( self.profile = profile self.profiling_filename = profiling_filename self.profiling_sorting = profiling_sorting - self.test_for_binary_target_binary_feature = test_for_binary_target_binary_feature + self.test_for_binary_target_binary_feature = ( + test_for_binary_target_binary_feature + ) self.test_for_binary_target_real_feature = test_for_binary_target_real_feature self.test_for_real_target_binary_feature = test_for_real_target_binary_feature self.test_for_real_target_real_feature = 
test_for_real_target_real_feature @@ -298,7 +306,9 @@ def transform(self, X): """ if self.timeseries_container is None: - raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.") + raise RuntimeError( + "You have to provide a time series using the set_timeseries_container function before." + ) if self.feature_selector is None: raise RuntimeError("You have to call fit before calling transform.") @@ -308,34 +318,46 @@ def transform(self, X): self.feature_extractor.set_timeseries_container(self.timeseries_container) - relevant_time_series_features = set(self.feature_selector.relevant_features) - set(pd.DataFrame(X).columns) + relevant_time_series_features = set( + self.feature_selector.relevant_features + ) - set(pd.DataFrame(X).columns) relevant_extraction_settings = from_columns(relevant_time_series_features) # Set imputing strategy - impute_function = partial(impute_dataframe_range, col_to_max=self.col_to_max, - col_to_min=self.col_to_min, col_to_median=self.col_to_median) - - relevant_feature_extractor = FeatureAugmenter(kind_to_fc_parameters=relevant_extraction_settings, - default_fc_parameters={}, - column_id=self.feature_extractor.column_id, - column_sort=self.feature_extractor.column_sort, - column_kind=self.feature_extractor.column_kind, - column_value=self.feature_extractor.column_value, - chunksize=self.feature_extractor.chunksize, - n_jobs=self.feature_extractor.n_jobs, - show_warnings=self.feature_extractor.show_warnings, - disable_progressbar=self.feature_extractor.disable_progressbar, - impute_function=impute_function, - profile=self.feature_extractor.profile, - profiling_filename=self.feature_extractor.profiling_filename, - profiling_sorting=self.feature_extractor.profiling_sorting) - - relevant_feature_extractor.set_timeseries_container(self.feature_extractor.timeseries_container) + impute_function = partial( + impute_dataframe_range, + col_to_max=self.col_to_max, + col_to_min=self.col_to_min, + col_to_median=self.col_to_median, + ) + + relevant_feature_extractor = FeatureAugmenter( + kind_to_fc_parameters=relevant_extraction_settings, + default_fc_parameters={}, + column_id=self.feature_extractor.column_id, + column_sort=self.feature_extractor.column_sort, + column_kind=self.feature_extractor.column_kind, + column_value=self.feature_extractor.column_value, + chunksize=self.feature_extractor.chunksize, + n_jobs=self.feature_extractor.n_jobs, + show_warnings=self.feature_extractor.show_warnings, + disable_progressbar=self.feature_extractor.disable_progressbar, + impute_function=impute_function, + profile=self.feature_extractor.profile, + profiling_filename=self.feature_extractor.profiling_filename, + profiling_sorting=self.feature_extractor.profiling_sorting, + ) + + relevant_feature_extractor.set_timeseries_container( + self.feature_extractor.timeseries_container + ) X_augmented = relevant_feature_extractor.transform(X) if self.filter_only_tsfresh_features: - return X_augmented.copy().loc[:, self.feature_selector.relevant_features + X.columns.tolist()] + return X_augmented.copy().loc[ + :, self.feature_selector.relevant_features + X.columns.tolist() + ] else: return X_augmented.copy().loc[:, self.feature_selector.relevant_features] @@ -357,10 +379,14 @@ def fit_transform(self, X, y): """ X_augmented = self._fit_and_augment(X, y) - selected_features = X_augmented.copy().loc[:, self.feature_selector.relevant_features] + selected_features = X_augmented.copy().loc[ + :, self.feature_selector.relevant_features + ] if 
self.filter_only_tsfresh_features: - selected_features = pd.merge(selected_features, X, left_index=True, right_index=True, how="left") + selected_features = pd.merge( + selected_features, X, left_index=True, right_index=True, how="left" + ) return selected_features @@ -381,7 +407,9 @@ def _fit_and_augment(self, X, y): :rtype: pandas.DataFrame """ if self.timeseries_container is None: - raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.") + raise RuntimeError( + "You have to provide a time series using the set_timeseries_container function before." + ) self.feature_extractor = FeatureAugmenter( default_fc_parameters=self.default_fc_parameters, @@ -397,7 +425,7 @@ def _fit_and_augment(self, X, y): disable_progressbar=self.disable_progressbar, profile=self.profile, profiling_filename=self.profiling_filename, - profiling_sorting=self.profiling_sorting + profiling_sorting=self.profiling_sorting, ) self.feature_selector = FeatureSelector( @@ -423,9 +451,17 @@ def _fit_and_augment(self, X, y): X_augmented = self.feature_extractor.transform(X_tmp) - self.col_to_max, self.col_to_min, self.col_to_median = get_range_values_per_column(X_augmented) - X_augmented = impute_dataframe_range(X_augmented, col_to_max=self.col_to_max, col_to_median=self.col_to_median, - col_to_min=self.col_to_min) + ( + self.col_to_max, + self.col_to_min, + self.col_to_median, + ) = get_range_values_per_column(X_augmented) + X_augmented = impute_dataframe_range( + X_augmented, + col_to_max=self.col_to_max, + col_to_median=self.col_to_median, + col_to_min=self.col_to_min, + ) self.feature_selector.fit(X_augmented, y) diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py index c1db2d725..fcf427a3c 100644 --- a/tsfresh/utilities/dataframe_functions.py +++ b/tsfresh/utilities/dataframe_functions.py @@ -11,7 +11,11 @@ import pandas as pd from tsfresh import defaults -from tsfresh.utilities.distribution import MapDistributor, MultiprocessingDistributor, DistributorBaseClass +from tsfresh.utilities.distribution import ( + MapDistributor, + MultiprocessingDistributor, + DistributorBaseClass, +) def check_for_nans_in_columns(df, columns=None): @@ -33,8 +37,13 @@ def check_for_nans_in_columns(df, columns=None): if pd.isnull(df.loc[:, columns]).any().any(): if not isinstance(columns, list): columns = list(columns) - raise ValueError("Columns {} of DataFrame must not contain NaN values".format( - df.loc[:, columns].columns[pd.isnull(df.loc[:, columns]).sum() > 0].tolist())) + raise ValueError( + "Columns {} of DataFrame must not contain NaN values".format( + df.loc[:, columns] + .columns[pd.isnull(df.loc[:, columns]).sum() > 0] + .tolist() + ) + ) def impute(df_impute): @@ -128,23 +137,33 @@ def impute_dataframe_range(df_impute, col_to_max, col_to_min, col_to_median): columns = df_impute.columns # Making sure col_to_median, col_to_max and col_to_min have entries for every column - if not set(columns) <= set(col_to_median.keys()) or \ - not set(columns) <= set(col_to_max.keys()) or \ - not set(columns) <= set(col_to_min.keys()): - raise ValueError("Some of the dictionaries col_to_median, col_to_max, col_to_min contains more or less keys " - "than the column names in df") + if ( + not set(columns) <= set(col_to_median.keys()) + or not set(columns) <= set(col_to_max.keys()) + or not set(columns) <= set(col_to_min.keys()) + ): + raise ValueError( + "Some of the dictionaries col_to_median, col_to_max, col_to_min contains more or less keys " + 
"than the column names in df" + ) # check if there are non finite values for the replacement - if np.any(~np.isfinite(list(col_to_median.values()))) or \ - np.any(~np.isfinite(list(col_to_min.values()))) or \ - np.any(~np.isfinite(list(col_to_max.values()))): - raise ValueError("Some of the dictionaries col_to_median, col_to_max, col_to_min contains non finite values " - "to replace") + if ( + np.any(~np.isfinite(list(col_to_median.values()))) + or np.any(~np.isfinite(list(col_to_min.values()))) + or np.any(~np.isfinite(list(col_to_max.values()))) + ): + raise ValueError( + "Some of the dictionaries col_to_median, col_to_max, col_to_min contains non finite values " + "to replace" + ) # Make the replacement dataframes as large as the real one col_to_max = pd.DataFrame([col_to_max] * len(df_impute), index=df_impute.index) col_to_min = pd.DataFrame([col_to_min] * len(df_impute), index=df_impute.index) - col_to_median = pd.DataFrame([col_to_median] * len(df_impute), index=df_impute.index) + col_to_median = pd.DataFrame( + [col_to_median] * len(df_impute), index=df_impute.index + ) df_impute.where(df_impute.values != np.PINF, other=col_to_max, inplace=True) df_impute.where(df_impute.values != np.NINF, other=col_to_min, inplace=True) @@ -176,8 +195,12 @@ def get_range_values_per_column(df): if np.any(is_col_non_finite): # We have columns that does not contain any finite value at all, so we will store 0 instead. - warnings.warn("The columns {} did not have any finite values. Filling with zeros.".format( - df.iloc[:, np.where(is_col_non_finite)[0]].columns.values), RuntimeWarning) + warnings.warn( + "The columns {} did not have any finite values. Filling with zeros.".format( + df.iloc[:, np.where(is_col_non_finite)[0]].columns.values + ), + RuntimeWarning, + ) masked.data[:, is_col_non_finite] = 0 # Set the values of the columns to 0 masked.mask[:, is_col_non_finite] = False # Remove the mask for this column @@ -216,8 +239,10 @@ def restrict_input_to_index(df_or_dict, column_id, index): df_or_dict_restricted = df_or_dict[df_or_dict[column_id].isin(index)] elif isinstance(df_or_dict, dict): - df_or_dict_restricted = {kind: restrict_input_to_index(df, column_id, index) - for kind, df in df_or_dict.items()} + df_or_dict_restricted = { + kind: restrict_input_to_index(df, column_id, index) + for kind, df in df_or_dict.items() + } else: raise TypeError("df_or_dict should be of type dict or pandas.DataFrame") @@ -246,8 +271,15 @@ def get_ids(df_or_dict, column_id): raise TypeError("df_or_dict should be of type dict or pandas.DataFrame") -def _roll_out_time_series(timeshift, grouped_data, rolling_direction, max_timeshift, min_timeshift, - column_sort, column_id): +def _roll_out_time_series( + timeshift, + grouped_data, + rolling_direction, + max_timeshift, + min_timeshift, + column_sort, + column_id, +): """ Internal helper function for roll_time_series. This function has the task to extract the rolled forecast data frame of the number `timeshift`. @@ -283,6 +315,7 @@ def _roll_out_time_series(timeshift, grouped_data, rolling_direction, max_timesh window from the back (but it is implemented to start counting from the beginning). 
""" + def _f(x): if rolling_direction > 0: # For positive rolling, the right side of the window moves with `timeshift` @@ -317,12 +350,20 @@ def _f(x): return [grouped_data.apply(_f)] -def roll_time_series(df_or_dict, column_id, column_sort=None, column_kind=None, - rolling_direction=1, max_timeshift=None, min_timeshift=0, - chunksize=defaults.CHUNKSIZE, - n_jobs=defaults.N_PROCESSES, show_warnings=defaults.SHOW_WARNINGS, - disable_progressbar=defaults.DISABLE_PROGRESSBAR, - distributor=None): +def roll_time_series( + df_or_dict, + column_id, + column_sort=None, + column_kind=None, + rolling_direction=1, + max_timeshift=None, + min_timeshift=0, + chunksize=defaults.CHUNKSIZE, + n_jobs=defaults.N_PROCESSES, + show_warnings=defaults.SHOW_WARNINGS, + disable_progressbar=defaults.DISABLE_PROGRESSBAR, + distributor=None, +): """ This method creates sub windows of the time series. It rolls the (sorted) data frames for each kind and each id separately in the "time" domain (which is represented by the sort order of the sort column given by `column_sort`). @@ -411,38 +452,52 @@ def roll_time_series(df_or_dict, column_id, column_sort=None, column_kind=None, if isinstance(df_or_dict, dict): if column_kind is not None: - raise ValueError("You passed in a dictionary and gave a column name for the kind. Both are not possible.") - - return {key: roll_time_series(df_or_dict=df_or_dict[key], - column_id=column_id, - column_sort=column_sort, - column_kind=column_kind, - rolling_direction=rolling_direction, - max_timeshift=max_timeshift, - min_timeshift=min_timeshift, - chunksize=chunksize, - n_jobs=n_jobs, - show_warnings=show_warnings, - disable_progressbar=disable_progressbar, - distributor=distributor) - for key in df_or_dict} + raise ValueError( + "You passed in a dictionary and gave a column name for the kind. Both are not possible." + ) + + return { + key: roll_time_series( + df_or_dict=df_or_dict[key], + column_id=column_id, + column_sort=column_sort, + column_kind=column_kind, + rolling_direction=rolling_direction, + max_timeshift=max_timeshift, + min_timeshift=min_timeshift, + chunksize=chunksize, + n_jobs=n_jobs, + show_warnings=show_warnings, + disable_progressbar=disable_progressbar, + distributor=distributor, + ) + for key in df_or_dict + } # Now we know that this is a pandas data frame df = df_or_dict if len(df) <= 1: - raise ValueError("Your time series container has zero or one rows!. Can not perform rolling.") + raise ValueError( + "Your time series container has zero or one rows!. Can not perform rolling." + ) if column_id is not None: if column_id not in df: - raise AttributeError("The given column for the id is not present in the data.") + raise AttributeError( + "The given column for the id is not present in the data." 
+ ) else: - raise ValueError("You have to set the column_id which contains the ids of the different time series") + raise ValueError( + "You have to set the column_id which contains the ids of the different time series" + ) if column_kind is not None: grouper = [column_kind, column_id] else: - grouper = [column_id, ] + grouper = [ + column_id, + ] if column_sort is not None: # Require no Nans in column @@ -456,13 +511,16 @@ def roll_time_series(df_or_dict, column_id, column_sort=None, column_kind=None, # Build the differences between consecutive time sort values differences = df.groupby(grouper)[column_sort].apply( - lambda x: x.values[:-1] - x.values[1:]) + lambda x: x.values[:-1] - x.values[1:] + ) # Write all of them into one big list differences = sum(map(list, differences), []) # Test if all differences are the same if differences and min(differences) != max(differences): - warnings.warn("Your time stamps are not uniformly sampled, which makes rolling " - "nonsensical in some domains.") + warnings.warn( + "Your time stamps are not uniformly sampled, which makes rolling " + "nonsensical in some domains." + ) # Roll the data frames if requested rolling_amount = np.abs(rolling_direction) @@ -484,13 +542,16 @@ def roll_time_series(df_or_dict, column_id, column_sort=None, column_kind=None, if distributor is None: if n_jobs == 0 or n_jobs == 1: - distributor = MapDistributor(disable_progressbar=disable_progressbar, - progressbar_title="Rolling") + distributor = MapDistributor( + disable_progressbar=disable_progressbar, progressbar_title="Rolling" + ) else: - distributor = MultiprocessingDistributor(n_workers=n_jobs, - disable_progressbar=disable_progressbar, - progressbar_title="Rolling", - show_warnings=show_warnings) + distributor = MultiprocessingDistributor( + n_workers=n_jobs, + disable_progressbar=disable_progressbar, + progressbar_title="Rolling", + show_warnings=show_warnings, + ) if not isinstance(distributor, DistributorBaseClass): raise ValueError("the passed distributor is not an DistributorBaseClass object") @@ -504,8 +565,12 @@ def roll_time_series(df_or_dict, column_id, column_sort=None, column_kind=None, "column_id": column_id, } - shifted_chunks = distributor.map_reduce(_roll_out_time_series, data=range_of_shifts, - chunk_size=chunksize, function_kwargs=kwargs) + shifted_chunks = distributor.map_reduce( + _roll_out_time_series, + data=range_of_shifts, + chunk_size=chunksize, + function_kwargs=kwargs, + ) distributor.close() @@ -549,17 +614,16 @@ def make_forecasting_frame(x, kind, max_timeshift, rolling_direction): else: t = range(n) - df = pd.DataFrame({"id": ["id"] * n, - "time": t, - "value": x, - "kind": kind}) + df = pd.DataFrame({"id": ["id"] * n, "time": t, "value": x, "kind": kind}) - df_shift = roll_time_series(df, - column_id="id", - column_sort="time", - column_kind="kind", - rolling_direction=rolling_direction, - max_timeshift=max_timeshift) + df_shift = roll_time_series( + df, + column_id="id", + column_sort="time", + column_kind="kind", + rolling_direction=rolling_direction, + max_timeshift=max_timeshift, + ) # drop the rows which should actually be predicted def mask_first(x): @@ -570,7 +634,7 @@ def mask_first(x): result[-1] = 0 return result - mask = df_shift.groupby(['id'])['id'].transform(mask_first).astype(bool) + mask = df_shift.groupby(["id"])["id"].transform(mask_first).astype(bool) df_shift = df_shift[mask] # Now create the target vector out of the values @@ -585,7 +649,9 @@ def mask_first(x): return df_shift, y -def 
add_sub_time_series_index(df_or_dict, sub_length, column_id=None, column_sort=None, column_kind=None): +def add_sub_time_series_index( + df_or_dict, sub_length, column_id=None, column_sort=None, column_kind=None +): """ Add a column "id" which contains: @@ -618,14 +684,20 @@ def add_sub_time_series_index(df_or_dict, sub_length, column_id=None, column_sor if isinstance(df_or_dict, dict): if column_kind is not None: - raise ValueError("You passed in a dictionary and gave a column name for the kind. Both are not possible.") - - return {key: add_sub_time_series_index(df_or_dict=df_or_dict[key], - sub_length=sub_length, - column_id=column_id, - column_sort=column_sort, - column_kind=column_kind) - for key in df_or_dict} + raise ValueError( + "You passed in a dictionary and gave a column name for the kind. Both are not possible." + ) + + return { + key: add_sub_time_series_index( + df_or_dict=df_or_dict[key], + sub_length=sub_length, + column_id=column_id, + column_sort=column_sort, + column_kind=column_kind, + ) + for key in df_or_dict + } df = df_or_dict @@ -641,9 +713,13 @@ def _add_id_column(df_chunk): last_chunk_number = chunk_length // sub_length reminder = chunk_length % sub_length - indices = np.concatenate([np.repeat(np.arange(last_chunk_number), sub_length), - np.repeat(last_chunk_number, reminder)]) - assert(len(indices) == chunk_length) + indices = np.concatenate( + [ + np.repeat(np.arange(last_chunk_number), sub_length), + np.repeat(last_chunk_number, reminder), + ] + ) + assert len(indices) == chunk_length if column_id: indices = list(zip(indices, df_chunk[column_id])) diff --git a/tsfresh/utilities/distribution.py b/tsfresh/utilities/distribution.py index 94dc66787..841692bdb 100644 --- a/tsfresh/utilities/distribution.py +++ b/tsfresh/utilities/distribution.py @@ -70,7 +70,15 @@ class DistributorBaseClass: Dependent on the implementation of the distribute function, this is done in parallel or using a cluster of nodes. """ - def map_reduce(self, map_function, data, function_kwargs=None, chunk_size=None, data_length=None): + + def map_reduce( + self, + map_function, + data, + function_kwargs=None, + chunk_size=None, + data_length=None, + ): """ This method contains the core functionality of the DistributorBaseClass class. @@ -135,7 +143,9 @@ def partition(data, chunk_size): # (= we have reached the end) # The islice(iterator, n) gets the next n elements from the iterator. # The list(...) makes sure we do not pass - return takewhile(bool, (list(islice(iterator, chunk_size)) for _ in repeat(None))) + return takewhile( + bool, (list(islice(iterator, chunk_size)) for _ in repeat(None)) + ) def __init__(self): """ @@ -160,7 +170,14 @@ def calculate_best_chunk_size(self, data_length): chunk_size += 1 return chunk_size - def map_reduce(self, map_function, data, function_kwargs=None, chunk_size=None, data_length=None): + def map_reduce( + self, + map_function, + data, + function_kwargs=None, + chunk_size=None, + data_length=None, + ): """ This method contains the core functionality of the DistributorBaseClass class. @@ -190,7 +207,9 @@ def map_reduce(self, map_function, data, function_kwargs=None, chunk_size=None, :rtype: list """ if not isinstance(data, Iterable): - raise ValueError("You passed data, which can not be handled by this distributor!") + raise ValueError( + "You passed data, which can not be handled by this distributor!" 
+ ) if data_length is None: data_length = len(data) @@ -204,11 +223,20 @@ def map_reduce(self, map_function, data, function_kwargs=None, chunk_size=None, if hasattr(self, "progressbar_title"): total_number_of_expected_results = math.ceil(data_length / chunk_size) - result = tqdm(self.distribute(_function_with_partly_reduce, chunk_generator, map_kwargs), - total=total_number_of_expected_results, - desc=self.progressbar_title, disable=self.disable_progressbar) + result = tqdm( + self.distribute( + _function_with_partly_reduce, chunk_generator, map_kwargs + ), + total=total_number_of_expected_results, + desc=self.progressbar_title, + disable=self.disable_progressbar, + ) else: - result = self.distribute(_function_with_partly_reduce, chunk_generator, map_kwargs), + result = ( + self.distribute( + _function_with_partly_reduce, chunk_generator, map_kwargs + ), + ) result = list(itertools.chain.from_iterable(result)) @@ -246,7 +274,9 @@ class MapDistributor(IterableDistributorBaseClass): Distributor using the python build-in map, which calculates each job sequentially one after the other. """ - def __init__(self, disable_progressbar=False, progressbar_title="Feature Extraction"): + def __init__( + self, disable_progressbar=False, progressbar_title="Feature Extraction" + ): """ Creates a new MapDistributor instance @@ -304,7 +334,9 @@ def __init__(self, n_workers): # attribute .local_dir_ is the path where the local dask workers store temporary files self.local_dir_ = tempfile.mkdtemp() - cluster = LocalCluster(n_workers=n_workers, processes=False, local_directory=self.local_dir_) + cluster = LocalCluster( + n_workers=n_workers, processes=False, local_directory=self.local_dir_ + ) self.client = Client(cluster) self.n_workers = n_workers @@ -329,7 +361,9 @@ def distribute(self, func, partitioned_chunks, kwargs): if isinstance(partitioned_chunks, Iterable): # since dask 2.0.0 client map no longer accepts iterables partitioned_chunks = list(partitioned_chunks) - result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks)) + result = self.client.gather( + self.client.map(partial(func, **kwargs), partitioned_chunks) + ) return [item for sublist in result for item in sublist] def close(self): @@ -388,7 +422,9 @@ def distribute(self, func, partitioned_chunks, kwargs): if isinstance(partitioned_chunks, Iterable): # since dask 2.0.0 client map no longer accepts iterables partitioned_chunks = list(partitioned_chunks) - result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks)) + result = self.client.gather( + self.client.map(partial(func, **kwargs), partitioned_chunks) + ) return [item for sublist in result for item in sublist] def close(self): @@ -403,8 +439,13 @@ class MultiprocessingDistributor(IterableDistributorBaseClass): Distributor using a multiprocessing Pool to calculate the jobs in parallel on the local machine. """ - def __init__(self, n_workers, disable_progressbar=False, progressbar_title="Feature Extraction", - show_warnings=True): + def __init__( + self, + n_workers, + disable_progressbar=False, + progressbar_title="Feature Extraction", + show_warnings=True, + ): """ Creates a new MultiprocessingDistributor instance @@ -417,7 +458,11 @@ def __init__(self, n_workers, disable_progressbar=False, progressbar_title="Feat :param show_warnings: whether to show warnings or not. 
:type show_warnings: bool """ - self.pool = Pool(processes=n_workers, initializer=initialize_warnings_in_workers, initargs=(show_warnings,)) + self.pool = Pool( + processes=n_workers, + initializer=initialize_warnings_in_workers, + initargs=(show_warnings,), + ) self.n_workers = n_workers self.disable_progressbar = disable_progressbar self.progressbar_title = progressbar_title @@ -452,5 +497,12 @@ class ApplyDistributor(DistributorBaseClass): def __init__(self, meta): self.meta = meta - def map_reduce(self, map_function, data, function_kwargs=None, chunk_size=None, data_length=None): + def map_reduce( + self, + map_function, + data, + function_kwargs=None, + chunk_size=None, + data_length=None, + ): return data.apply(map_function, meta=self.meta, **function_kwargs) diff --git a/tsfresh/utilities/profiling.py b/tsfresh/utilities/profiling.py index 072ae2833..8dbef1a04 100644 --- a/tsfresh/utilities/profiling.py +++ b/tsfresh/utilities/profiling.py @@ -17,6 +17,7 @@ # todo: tackle a debate about the need for this profiler # todo: we need unit tests for the profiling routine + def start_profiling(): """ Helper function to start the profiling process and return the profiler (to close it later). @@ -62,5 +63,7 @@ def end_profiling(profiler, filename, sorting=None): ps.print_stats() with open(filename, "w+") as f: - _logger.info("[calculate_ts_features] Finished profiling of time series feature extraction") + _logger.info( + "[calculate_ts_features] Finished profiling of time series feature extraction" + ) f.write(s.getvalue()) diff --git a/tsfresh/utilities/string_manipulation.py b/tsfresh/utilities/string_manipulation.py index 170307ad0..b35fa2703 100644 --- a/tsfresh/utilities/string_manipulation.py +++ b/tsfresh/utilities/string_manipulation.py @@ -67,4 +67,7 @@ def add_parenthesis_if_string_value(x): else: return str(x) - return "__".join(str(key) + "_" + add_parenthesis_if_string_value(param[key]) for key in sorted(param.keys())) + return "__".join( + str(key) + "_" + add_parenthesis_if_string_value(param[key]) + for key in sorted(param.keys()) + )
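# --- Illustrative sketch, not part of the patch ---
# The helpers reformatted above round-trip through feature column names:
# string_manipulation builds the "<param>_<value>" part of a name, and
# settings.from_columns parses full "<kind>__<calculator>__<params>" column
# names back into a settings mapping. The small frame and parameter choice
# below are made up for the demo.
import numpy as np
import pandas as pd

from tsfresh import extract_features
from tsfresh.feature_extraction.settings import from_columns

df = pd.DataFrame(
    {
        "id": [1] * 10 + [2] * 10,
        "time": list(range(10)) * 2,
        "value": np.random.randn(20),
    }
)

fc_parameters = {
    "mean": None,  # calculators without parameters map to None
    "autocorrelation": [{"lag": 1}, {"lag": 2}],
}

X = extract_features(
    df,
    column_id="id",
    column_sort="time",
    default_fc_parameters=fc_parameters,
    disable_progressbar=True,
)

# Columns look like "value__autocorrelation__lag_1"; from_columns() turns them
# back into a kind_to_fc_parameters mapping that can be reused on new data.
print(from_columns(X.columns)["value"])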