From 2b83413eeb07d88dec2c1a25ac4f1a70e825c94a Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Tue, 26 Nov 2019 21:30:17 +0100 Subject: [PATCH 1/4] Refactor .travis.yml part II (#605) * Refactor .travis.yml Only build coverage with Python 3.7 and latest package versions, this allows test stages to run in parallel. Add python 3.8 but allow it to fail, some dependencies are not yet ready. * Add comments and merge before_script into install * Fix test coverage stage in .travis.yml * Fix install phase, it is run during all stages * Fix TRAVIS_JOB_ID when running coveralls We want to scan output of 1st job. * Do not call coveralls in a separate stage Previous commit did not work, there was some confusion between TRAVIS_JOB_ID and TRAVIS_JOB_NUMBER. I do not see how to get TRAVIS_JOB_ID for the right job, thus call coveralls inside this job. * Some reordering and adding more comments --- .travis.yml | 167 ++++++++++++++++++++++------------------------------ 1 file changed, 69 insertions(+), 98 deletions(-) diff --git a/.travis.yml b/.travis.yml index a20e49d1c..765a6c87e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,123 +1,94 @@ -# Travis Build file for tsfresh +## Travis Build file for tsfresh + language: python -# We want the pip folder to be cached, to speed up installation +os: linux +notifications: + slack: tsfresh:uIzPVnlBQs32xE5jbq34f0Cq + +# We want the pip folder to be cached to speed up installation cache: directories: - $HOME/.cache/pip + +# Installation of packages install: + # Begin by updating pip to its newest version - pip install --upgrade pip wheel setuptools + # Then install the requirements as they are defined - pip install -r requirements.txt -r test-requirements.txt - - pip install -U . 
- - pip freeze -jobs: - include: - - stage: Run tests on newest set of dependencies (Python 3.5.3) - env: NUMPY="latest", PANDAS="latest", SCIKITLEARN="latest", DASK="latest", DISTRIBUTED="latest", SCIPY="latest" - before_script: - - pip install --upgrade numpy - - pip install --upgrade pandas - - pip install --upgrade scikit-learn - - pip install --upgrade dask - - pip install --upgrade distributed - - pip install --upgrade scipy - - pip list - script: - - sed -i 's/\-n auto/\-n 2/g' setup.cfg - - "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi" - python: 3.5.3 + # Then install the package + - pip install . + # Now downgrade packages if required by environment variables + - "[ \"$NUMPY\" = latest ] && pip install --upgrade numpy || [ -z \"$NUMPY\" ] || pip install numpy==$NUMPY" + - "[ \"$PANDAS\" = latest ] && pip install --upgrade pandas || [ -z \"$PANDAS\" ] || pip install pandas==$PANDAS" + - "[ \"$SCIKITLEARN\" = latest ] && pip install --upgrade scikit-learn || [ -z \"$SCIKITLEARN\" ] || pip install scikit-learn==$SCIKITLEARN" + - "[ \"$DASK\" = latest ] && pip install --upgrade dask || [ -z \"$DASK\" ] || pip install dask==$DASK" + - "[ \"$DISTRIBUTED\" = latest ] && pip install --upgrade distributed || [ -z \"$DISTRIBUTED\" ] || pip install distributed==$DISTRIBUTED" + # need to downgrade tornado manually + - "[ \"$SCIPY\" = latest ] || [ -z \"$SCIPY\" ] || pip install tornado==4.5.3" + - "[ \"$SCIPY\" = latest ] && pip install --upgrade scipy || [ -z \"$SCIPY\" ] || pip install scipy==$SCIPY" + # Print out the pip versions for debugging + - pip list + # Only use two cores + - sed -i -e 's/-n auto/-n 2/g' setup.cfg + # we want to run coverage only inside a single job + - sed -e '/^\s*--cov tsfresh/d' setup.cfg > setup-nocov.cfg - - stage: Run tests on newest set of dependencies (Python 3.6) - env: NUMPY="latest", PANDAS="latest", SCIKITLEARN="latest", DASK="latest", 
DISTRIBUTED="latest", SCIPY="latest" - before_script: - - pip install --upgrade numpy - - pip install --upgrade pandas - - pip install --upgrade scikit-learn - - pip install --upgrade dask - - pip install --upgrade distributed - - pip install --upgrade scipy - - pip list - script: - - sed -i 's/\-n auto/\-n 2/g' setup.cfg - - "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi" - python: 3.6 +# all jobs call pytest with setup-nocov.cfg to not run coverage test - except one which overrides this variable +env: + global: + - PYTEST_ADDOPTS="-c setup-nocov.cfg" +# The script to call on tests +script: "if [ $TRAVIS_PULL_REQUEST = false ] && ! [ $TRAVIS_BRANCH = master ]; then pytest tests/units; else pytest tests; fi" + +# Now define the different stages +jobs: + include: + # First stage: tests + - stage: Run tests + env: NUMPY="latest" PANDAS="latest" SCIKITLEARN="latest" DASK="latest" DISTRIBUTED="latest" SCIPY="latest" + python: 3.8 - - stage: Run tests on newest set of dependencies (Python 3.7) - env: NUMPY="latest", PANDAS="latest", SCIKITLEARN="latest", DASK="latest", DISTRIBUTED="latest", SCIPY="latest" - before_script: - - pip install --upgrade numpy - - pip install --upgrade pandas - - pip install --upgrade scikit-learn - - pip install --upgrade dask - - pip install --upgrade distributed - - pip install --upgrade scipy - - pip list - script: - - sed -i 's/\-n auto/\-n 2/g' setup.cfg - - "if [ $TRAVIS_PULL_REQUEST == false ] && ! 
[ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi" + - env: NUMPY="latest" PANDAS="latest" SCIKITLEARN="latest" DASK="latest" DISTRIBUTED="latest" SCIPY="latest" PYTEST_ADDOPTS="-c setup.cfg" python: 3.7 + # We only run coverage tests here (we also set a different setup.cfg script here) after_success: - coveralls - - stage: Run tests on oldest set of dependencies (Python 3.5.3) - env: NUMPY="1.12.0", PANDAS="0.20.3", SCIKITLEARN="0.19.0", DASK="0.15.2", DISTRIBUTED="1.18.3", SCIPY="1.2.0" - before_script: - - pip install numpy==1.12.0 # First version with official python 3.6 support. - - pip install pandas==0.20.3 - - pip install scikit-learn==0.19.0 - - pip install dask==0.15.2 - - pip install distributed==1.18.3 - # need to downgrade tornado manually - - pip install tornado==4.5.3 - - pip install scipy==1.2.0 - - pip list - script: - - sed -i 's/\-n auto/\-n 2/g' setup.cfg - - "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi" + - env: NUMPY="latest" PANDAS="latest" SCIKITLEARN="latest" DASK="latest" DISTRIBUTED="latest" SCIPY="latest" + python: 3.6 + + - env: NUMPY="latest" PANDAS="latest" SCIKITLEARN="latest" DASK="latest" DISTRIBUTED="latest" SCIPY="latest" + # newest pandas (>= 0.25) requires python >= 3.5.3 python: 3.5.3 - - stage: Run tests on oldest set of dependencies (Python 3.6) - env: NUMPY="1.12.0", PANDAS="0.20.3", SCIKITLEARN="0.19.0", DASK="0.15.2", DISTRIBUTED="1.18.3", SCIPY="1.2.0" - before_script: - - pip install numpy==1.12.0 # First version with official python 3.6 support. - - pip install pandas==0.20.3 - - pip install scikit-learn==0.19.0 - - pip install dask==0.15.2 - - pip install distributed==1.18.3 - # need to downgrade tornado manually - - pip install tornado==4.5.3 - - pip install scipy==1.2.0 - - pip list - script: - - sed -i 's/\-n auto/\-n 2/g' setup.cfg - - "if [ $TRAVIS_PULL_REQUEST == false ] && ! 
[ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi" + - env: NUMPY="1.15.1" PANDAS="0.23.2" SCIKITLEARN="0.19.2" DASK="0.16.1" DISTRIBUTED="1.18.3" SCIPY="1.2.0" + # python 3.7 requires pandas >= 0.23.2 + python: 3.7 + + - env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" python: 3.6 - - stage: Run tests on oldest set of dependencies (Python 3.7) - env: NUMPY="1.15.1", PANDAS="0.23.2", SCIKITLEARN="0.19.2", DASK="0.16.1", DISTRIBUTED="1.18.3", SCIPY="1.2.0" - before_script: - - pip install numpy==1.15.1 # First version with official python 3.7 support. - - pip install pandas==0.23.2 # First version with official python 3.7 support. - - pip install scikit-learn==0.19.2 # First version with python 3.7 support. - - pip install dask==0.16.1 - - pip install distributed==1.18.3 - # need to downgrade tornado manually - - pip install tornado==4.5.3 - - pip install scipy==1.2.0 - - pip list - script: - - sed -i 's/\-n auto/\-n 2/g' setup.cfg - - "if [ $TRAVIS_PULL_REQUEST == false ] && ! 
[ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi" - python: 3.7 + - env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" + python: 3.5.3 + + # Second stage: deploy + - stage: Deploy + if: tag IS present AND fork = false + python: 3.7 + script: skip deploy: provider: pypi user: MaxChrist password: secure: Jh0Z69Mh+esOpegXyXoecFOkpMhaQaiJbQVEvVvQ2K1rCmCE20a19/TGfPUrynpqOYXZvvb5Ok6CtlzAi9J5huA3MRSf4iYPsUe8i7n0FK4JU5BP7VqM3/7cQZMdD5SeYFV3e3JURDcKYfoG7N+DNb+LfluYK5MBkRLhdEVRqSeHocPY4QRzzhJi1ljX99ThdRPrsQqYaD3tpZxJhbJDgHLtvMr39+407uQSDnubvFz3iu90DZiN2fIP5bEN6PDuaGXNZMA1p40DjSkGc7epg0U4vHn6CSya1nXlqjXUqXYJY5Ha2kbMAN7hfmU+gId09+FSHQRuanKJkRqSBksVgATCAeSAiqAe3EPAsG75ewhXDeusQZMzRy7DxQzjOJG9oIyWMVmZFlIoNlpg2eifN9uUc7FfyGHiVfWwUDslszpc/81hQViMPP0NoMAop4zcWR3ChCMnHMycPQEmWuV65WfL7yN6SuTokxSmepubPtFs+4UIlI0rgZWCHVIgGZqI8LFn958pLtpQ+32Ew8HGU3IiOfao9HbGreQ2Lgqo2L2EyNDWiHfJ3oZ1+6BP/1GqI6j7x7oPdwoE1jvY4CSC7iMAiieZNnrvywvmJpZB69CGefxQJzWcm+yD03QwNBFFaabCbKwbn+q3eUOUrPRuvTkhVLRWDxQNH/zaZyuZQ+Q= distributions: "sdist bdist_wheel" - on: - tags: true - repo: blue-yonder/tsfresh -notifications: - slack: tsfresh:uIzPVnlBQs32xE5jbq34f0Cq + + # Some dependencies are not yet ready for Python 3.8 + allow_failures: + - python: 3.8 + # Make sure to not wait for Python 3.8 + fast_finish: true + \ No newline at end of file From ceb6bd0c3e437a44394098cda4225276f9316681 Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Tue, 26 Nov 2019 22:16:04 +0100 Subject: [PATCH 2/4] Ignore .vscode (#608) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a372f01f7..8fe2f2ff4 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ __pycache__/* .pydevproject .settings .idea +.vscode # Package files *.egg From 0654cec5281fe82b517c96ddad8351c53e9c6db5 Mon Sep 17 00:00:00 2001 From: Denis Barbier Date: Thu, 28 Nov 2019 18:47:05 +0100 Subject: [PATCH 3/4] Fix docstring of c3 (#590) 
There were two issues: * sum index should start from 1 and not 0 * x[i+2*lag] should not be squared Fix #585. --- tsfresh/feature_extraction/feature_calculators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py index f4e0b58a6..67c4881c0 100644 --- a/tsfresh/feature_extraction/feature_calculators.py +++ b/tsfresh/feature_extraction/feature_calculators.py @@ -1404,13 +1404,13 @@ def c3(x, lag): .. math:: - \\frac{1}{n-2lag} \\sum_{i=0}^{n-2lag} x_{i + 2 \\cdot lag}^2 \\cdot x_{i + lag} \\cdot x_{i} + \\frac{1}{n-2lag} \\sum_{i=1}^{n-2lag} x_{i + 2 \\cdot lag} \\cdot x_{i + lag} \\cdot x_{i} which is .. math:: - \\mathbb{E}[L^2(X)^2 \\cdot L(X) \\cdot X] + \\mathbb{E}[L^2(X) \\cdot L(X) \\cdot X] where :math:`\\mathbb{E}` is the mean and :math:`L` is the lag operator. It was proposed in [1] as a measure of non linearity in the time series. From ea61188bdbb901ec2db4857a2d1aa716ed3ce5c9 Mon Sep 17 00:00:00 2001 From: Denis Barbier Date: Tue, 3 Dec 2019 21:11:58 +0100 Subject: [PATCH 4/4] Improve test coverage (#609) * Improve test coverage * Fix ClusterDaskDistributor with recent Dask Apply the same fix as with LocalDaskDistributor. Fix #575. 
--- .../units/feature_extraction/test_settings.py | 2 +- .../utilities/test_dataframe_functions.py | 17 +++++++ tests/units/utilities/test_distribution.py | 51 ++++++++++++++++++- tsfresh/utilities/distribution.py | 6 ++- 4 files changed, 71 insertions(+), 5 deletions(-) diff --git a/tests/units/feature_extraction/test_settings.py b/tests/units/feature_extraction/test_settings.py index dd2a2d3ec..e63ee95ba 100644 --- a/tests/units/feature_extraction/test_settings.py +++ b/tests/units/feature_extraction/test_settings.py @@ -20,7 +20,7 @@ class TestSettingsObject(TestCase): def test_from_column_raises_on_wrong_column_format(self): self.assertRaises(TypeError, from_columns, 42) - self.assertRaises(TypeError, from_columns, 42) + self.assertRaises(TypeError, from_columns, [42]) self.assertRaises(ValueError, from_columns, ["This is not a column name"]) self.assertRaises(ValueError, from_columns, ["This__neither"]) self.assertRaises(ValueError, from_columns, ["This__also__not"]) diff --git a/tests/units/utilities/test_dataframe_functions.py b/tests/units/utilities/test_dataframe_functions.py index eee9143ba..dda9661a5 100644 --- a/tests/units/utilities/test_dataframe_functions.py +++ b/tests/units/utilities/test_dataframe_functions.py @@ -111,10 +111,18 @@ def test_with_wrong_input(self): self.assertRaises(AttributeError, dataframe_functions._normalize_input_to_internal_representation, test_df, "strange_id", "sort", "kind", "value") + test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "sort": 1}]) + self.assertRaises(AttributeError, dataframe_functions._normalize_input_to_internal_representation, test_df, + "id", "sort", "strange_kind", "value") + test_df = pd.DataFrame([{"id": np.NaN, "kind": "a", "value": 3, "sort": 1}]) self.assertRaises(ValueError, dataframe_functions._normalize_input_to_internal_representation, test_df, "id", "sort", "kind", "value") + test_df = pd.DataFrame([{"id": 0, "kind": np.NaN, "value": 3, "sort": 1}]) + self.assertRaises(ValueError, 
dataframe_functions._normalize_input_to_internal_representation, test_df, + "id", "sort", "kind", "value") + test_df = pd.DataFrame([{"id": 2}, {"id": 1}]) test_dict = {"a": test_df, "b": test_df} @@ -202,6 +210,11 @@ def test_with_wrong_input(self): column_sort="sort", column_kind="kind", rolling_direction=1) + self.assertRaises(ValueError, dataframe_functions.roll_time_series, + df_or_dict=test_df, column_id=None, + column_sort="sort", column_kind="kind", + rolling_direction=1) + test_df = {"a": pd.DataFrame([{"id": 0}])} self.assertRaises(ValueError, dataframe_functions.roll_time_series, df_or_dict=test_df, column_id="id", @@ -753,3 +766,7 @@ def test_get_id__correct_dict(self): df_dict = {"a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})} self.assertEqual(get_ids(df_dict, "id"), {1, 2, 3, 4}) + + def test_get_id_wrong(self): + other_type = np.array([1, 2, 3]) + self.assertRaises(TypeError, get_ids, other_type, "id") diff --git a/tests/units/utilities/test_distribution.py b/tests/units/utilities/test_distribution.py index 6089092eb..0da8a2a50 100644 --- a/tests/units/utilities/test_distribution.py +++ b/tests/units/utilities/test_distribution.py @@ -5,15 +5,16 @@ from unittest import TestCase import numpy as np import pandas as pd +from distributed import LocalCluster, Client from tsfresh import extract_features -from tsfresh.utilities.distribution import MultiprocessingDistributor, LocalDaskDistributor +from tsfresh.utilities.distribution import MultiprocessingDistributor, LocalDaskDistributor, ClusterDaskDistributor from tests.fixtures import DataTestCase class MultiprocessingDistributorTestCase(TestCase): - def test_partion(self): + def test_partition(self): distributor = MultiprocessingDistributor(n_workers=1) @@ -82,3 +83,49 @@ def test_local_dask_cluster_extraction_two_worker(self): self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 
34.75]))) self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0]))) +class ClusterDaskDistributorTestCase(DataTestCase): + + def test_dask_cluster_extraction_one_worker(self): + cluster = LocalCluster(n_workers=1, threads_per_worker=1, diagnostics_port=False) + client = Client(cluster) + address = client.scheduler_info()['address'] + Distributor = ClusterDaskDistributor(address=address) + + df = self.create_test_data_sample() + extracted_features = extract_features(df, column_id="id", column_sort="sort", column_kind="kind", + column_value="val", + distributor=Distributor) + + self.assertIsInstance(extracted_features, pd.DataFrame) + self.assertTrue(np.all(extracted_features.a__maximum == np.array([71, 77]))) + self.assertTrue(np.all(extracted_features.a__sum_values == np.array([691, 1017]))) + self.assertTrue(np.all(extracted_features.a__abs_energy == np.array([32211, 63167]))) + self.assertTrue(np.all(extracted_features.b__sum_values == np.array([757, 695]))) + self.assertTrue(np.all(extracted_features.b__minimum == np.array([3, 1]))) + self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483]))) + self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75]))) + self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0]))) + cluster.close() + + def test_dask_cluster_extraction_two_workers(self): + cluster = LocalCluster(n_workers=2, threads_per_worker=1, diagnostics_port=False) + client = Client(cluster) + address = client.scheduler_info()['address'] + Distributor = ClusterDaskDistributor(address=address) + + df = self.create_test_data_sample() + extracted_features = extract_features(df, column_id="id", column_sort="sort", column_kind="kind", + column_value="val", + distributor=Distributor) + + self.assertIsInstance(extracted_features, pd.DataFrame) + self.assertTrue(np.all(extracted_features.a__maximum == np.array([71, 77]))) + self.assertTrue(np.all(extracted_features.a__sum_values 
== np.array([691, 1017]))) + self.assertTrue(np.all(extracted_features.a__abs_energy == np.array([32211, 63167]))) + self.assertTrue(np.all(extracted_features.b__sum_values == np.array([757, 695]))) + self.assertTrue(np.all(extracted_features.b__minimum == np.array([3, 1]))) + self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483]))) + self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75]))) + self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0]))) + cluster.close() + diff --git a/tsfresh/utilities/distribution.py b/tsfresh/utilities/distribution.py index 857129725..8a43f5dfb 100644 --- a/tsfresh/utilities/distribution.py +++ b/tsfresh/utilities/distribution.py @@ -261,7 +261,7 @@ def distribute(self, func, partitioned_chunks, kwargs): """ if isinstance(partitioned_chunks, Iterable): - # since dask 2.0.0 client map no longer accepts iteratables + # since dask 2.0.0 client map no longer accepts iterables partitioned_chunks = list(partitioned_chunks) result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks)) return [item for sublist in result for item in sublist] @@ -319,7 +319,9 @@ def distribute(self, func, partitioned_chunks, kwargs): :return: The result of the calculation as a list - each item should be the result of the application of func to a single element. """ - + if isinstance(partitioned_chunks, Iterable): + # since dask 2.0.0 client map no longer accepts iterables + partitioned_chunks = list(partitioned_chunks) result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks)) return [item for sublist in result for item in sublist]