Merge remote-tracking branch 'origin/master' into feature/pep8

blue-yonder · Dec 3, 2019 · f0b83ec · f0b83ec
2 parents 430f029 + ea61188
commit f0b83ec
Show file tree

Hide file tree

Showing 10 changed files with 163 additions and 105 deletions.
diff --git a/.gitignore b/.gitignore
@@ -17,6 +17,7 @@ __pycache__/*
 .pydevproject
 .settings
 .idea
+.vscode
 
 # Package files
 *.egg

diff --git a/.travis.yml b/.travis.yml
@@ -1,123 +1,94 @@
-# Travis Build file for tsfresh
+## Travis Build file for tsfresh
+
 language: python
-# We want the pip folder to be cached, to speed up installation
+os: linux
+notifications:
+   slack: tsfresh:uIzPVnlBQs32xE5jbq34f0Cq
+
+# We want the pip folder to be cached to speed up installation
 cache:
   directories:
     - $HOME/.cache/pip
+
+# Installation of packages
 install:
+  # Begin by updating pip to its newest version
   - pip install --upgrade pip wheel setuptools
+  # Then install the requirements as they are defined
   - pip install -r requirements.txt -r test-requirements.txt
-  - pip install -U .
-  - pip freeze
-jobs:
-  include:
-    - stage: Run tests on newest set of dependencies (Python 3.5.3)
-      env: NUMPY="latest", PANDAS="latest", SCIKITLEARN="latest", DASK="latest", DISTRIBUTED="latest", SCIPY="latest"
-      before_script:
-        - pip install --upgrade numpy
-        - pip install --upgrade pandas
-        - pip install --upgrade scikit-learn
-        - pip install --upgrade dask
-        - pip install --upgrade distributed
-        - pip install --upgrade scipy
-        - pip list
-      script:
-        - sed -i 's/\-n auto/\-n 2/g' setup.cfg
-        - "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi"
-      python: 3.5.3
+  # Then install the package
+  - pip install .
+  # Now upgrade or pin packages as requested via environment variables ("latest" upgrades, a version number pins, an unset variable changes nothing)
+  - "[ \"$NUMPY\" = latest ] && pip install --upgrade numpy || [ -z \"$NUMPY\" ] || pip install numpy==$NUMPY"
+  - "[ \"$PANDAS\" = latest ] && pip install --upgrade pandas || [ -z \"$PANDAS\" ] || pip install pandas==$PANDAS"
+  - "[ \"$SCIKITLEARN\" = latest ] && pip install --upgrade scikit-learn || [ -z \"$SCIKITLEARN\" ] || pip install scikit-learn==$SCIKITLEARN"
+  - "[ \"$DASK\" = latest ] && pip install --upgrade dask || [ -z \"$DASK\" ] || pip install dask==$DASK"
+  - "[ \"$DISTRIBUTED\" = latest ] && pip install --upgrade distributed || [ -z \"$DISTRIBUTED\" ] || pip install distributed==$DISTRIBUTED"
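+  # tornado needs to be downgraded manually whenever old versions are pinned (SCIPY being set to a version number is used as the marker for such a job)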
+  - "[ \"$SCIPY\" = latest ] || [ -z \"$SCIPY\" ] || pip install tornado==4.5.3"
+  - "[ \"$SCIPY\" = latest ] && pip install --upgrade scipy || [ -z \"$SCIPY\" ] || pip install scipy==$SCIPY"
+  # Print out the pip versions for debugging
+  - pip list
+  # Only use two cores
+  - sed -i -e 's/-n auto/-n 2/g' setup.cfg
+  # we want to run coverage only inside a single job
+  - sed -e '/^\s*--cov tsfresh/d' setup.cfg > setup-nocov.cfg
 
-    - stage: Run tests on newest set of dependencies (Python 3.6)
-      env: NUMPY="latest", PANDAS="latest", SCIKITLEARN="latest", DASK="latest", DISTRIBUTED="latest", SCIPY="latest"
-      before_script:
-        - pip install --upgrade numpy
-        - pip install --upgrade pandas
-        - pip install --upgrade scikit-learn
-        - pip install --upgrade dask
-        - pip install --upgrade distributed
-        - pip install --upgrade scipy
-        - pip list
-      script:
-        - sed -i 's/\-n auto/\-n 2/g' setup.cfg
-        - "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi"
-      python: 3.6
+# all jobs call pytest with setup-nocov.cfg to not run coverage test - except one which overrides this variable
+env:
+  global:
+    - PYTEST_ADDOPTS="-c setup-nocov.cfg"
 
+# The script to call on tests
+script: "if [ $TRAVIS_PULL_REQUEST = false ] && ! [ $TRAVIS_BRANCH = master ]; then pytest tests/units; else pytest tests; fi"
+
+# Now define the different stages
+jobs:
+  include:
+    # First stage: tests
+    - stage: Run tests
+      env: NUMPY="latest" PANDAS="latest" SCIKITLEARN="latest" DASK="latest" DISTRIBUTED="latest" SCIPY="latest"
+      python: 3.8
 
-    - stage: Run tests on newest set of dependencies (Python 3.7)
-      env: NUMPY="latest", PANDAS="latest", SCIKITLEARN="latest", DASK="latest", DISTRIBUTED="latest", SCIPY="latest"
-      before_script:
-        - pip install --upgrade numpy
-        - pip install --upgrade pandas
-        - pip install --upgrade scikit-learn
-        - pip install --upgrade dask
-        - pip install --upgrade distributed
-        - pip install --upgrade scipy
-        - pip list
-      script:
-        - sed -i 's/\-n auto/\-n 2/g' setup.cfg
-        - "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi"
+    - env: NUMPY="latest" PANDAS="latest" SCIKITLEARN="latest" DASK="latest" DISTRIBUTED="latest" SCIPY="latest" PYTEST_ADDOPTS="-c setup.cfg"
       python: 3.7
+      # Coverage is only collected in this job: PYTEST_ADDOPTS is overridden above to use the original setup.cfg instead of setup-nocov.cfg
       after_success:
         - coveralls
 
-    - stage: Run tests on oldest set of dependencies (Python 3.5.3)
-      env: NUMPY="1.12.0", PANDAS="0.20.3", SCIKITLEARN="0.19.0", DASK="0.15.2", DISTRIBUTED="1.18.3", SCIPY="1.2.0"
-      before_script:
-        - pip install numpy==1.12.0  # First version with official python 3.6 support.
-        - pip install pandas==0.20.3
-        - pip install scikit-learn==0.19.0
-        - pip install dask==0.15.2
-        - pip install distributed==1.18.3
-        # need to downgrade tornado manually
-        - pip install tornado==4.5.3
-        - pip install scipy==1.2.0
-        - pip list
-      script:
-        - sed -i 's/\-n auto/\-n 2/g' setup.cfg
-        - "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi"
+    - env: NUMPY="latest" PANDAS="latest" SCIKITLEARN="latest" DASK="latest" DISTRIBUTED="latest" SCIPY="latest"
+      python: 3.6
+
+    - env: NUMPY="latest" PANDAS="latest" SCIKITLEARN="latest" DASK="latest" DISTRIBUTED="latest" SCIPY="latest"
+      # newest pandas (>= 0.25) requires python >= 3.5.3
       python: 3.5.3
 
-    - stage: Run tests on oldest set of dependencies (Python 3.6)
-      env: NUMPY="1.12.0", PANDAS="0.20.3", SCIKITLEARN="0.19.0", DASK="0.15.2", DISTRIBUTED="1.18.3", SCIPY="1.2.0"
-      before_script:
-        - pip install numpy==1.12.0  # First version with official python 3.6 support.
-        - pip install pandas==0.20.3
-        - pip install scikit-learn==0.19.0
-        - pip install dask==0.15.2
-        - pip install distributed==1.18.3
-        # need to downgrade tornado manually
-        - pip install tornado==4.5.3
-        - pip install scipy==1.2.0
-        - pip list
-      script:
-        - sed -i 's/\-n auto/\-n 2/g' setup.cfg
-        - "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi"
+    - env: NUMPY="1.15.1" PANDAS="0.23.2" SCIKITLEARN="0.19.2" DASK="0.16.1" DISTRIBUTED="1.18.3" SCIPY="1.2.0"
+      # python 3.7 requires pandas >= 0.23.2
+      python: 3.7
+
+    - env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0"
       python: 3.6
 
-    - stage: Run tests on oldest set of dependencies (Python 3.7)
-      env: NUMPY="1.15.1", PANDAS="0.23.2", SCIKITLEARN="0.19.2", DASK="0.16.1", DISTRIBUTED="1.18.3", SCIPY="1.2.0"
-      before_script:
-        - pip install numpy==1.15.1 # First version with official python 3.7 support.
-        - pip install pandas==0.23.2 # First version with official python 3.7 support.
-        - pip install scikit-learn==0.19.2 # First version with python 3.7 support.
-        - pip install dask==0.16.1
-        - pip install distributed==1.18.3
-        # need to downgrade tornado manually
-        - pip install tornado==4.5.3
-        - pip install scipy==1.2.0
-        - pip list
-      script:
-        - sed -i 's/\-n auto/\-n 2/g' setup.cfg
-        - "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi"
-      python: 3.7
+    - env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0"
+      python: 3.5.3
 
+
+    # Second stage: deploy
+    - stage: Deploy
+      if: tag IS present AND fork = false
+      python: 3.7
+      script: skip
       deploy:
         provider: pypi
         user: MaxChrist
         password:
           secure: Jh0Z69Mh+esOpegXyXoecFOkpMhaQaiJbQVEvVvQ2K1rCmCE20a19/TGfPUrynpqOYXZvvb5Ok6CtlzAi9J5huA3MRSf4iYPsUe8i7n0FK4JU5BP7VqM3/7cQZMdD5SeYFV3e3JURDcKYfoG7N+DNb+LfluYK5MBkRLhdEVRqSeHocPY4QRzzhJi1ljX99ThdRPrsQqYaD3tpZxJhbJDgHLtvMr39+407uQSDnubvFz3iu90DZiN2fIP5bEN6PDuaGXNZMA1p40DjSkGc7epg0U4vHn6CSya1nXlqjXUqXYJY5Ha2kbMAN7hfmU+gId09+FSHQRuanKJkRqSBksVgATCAeSAiqAe3EPAsG75ewhXDeusQZMzRy7DxQzjOJG9oIyWMVmZFlIoNlpg2eifN9uUc7FfyGHiVfWwUDslszpc/81hQViMPP0NoMAop4zcWR3ChCMnHMycPQEmWuV65WfL7yN6SuTokxSmepubPtFs+4UIlI0rgZWCHVIgGZqI8LFn958pLtpQ+32Ew8HGU3IiOfao9HbGreQ2Lgqo2L2EyNDWiHfJ3oZ1+6BP/1GqI6j7x7oPdwoE1jvY4CSC7iMAiieZNnrvywvmJpZB69CGefxQJzWcm+yD03QwNBFFaabCbKwbn+q3eUOUrPRuvTkhVLRWDxQNH/zaZyuZQ+Q=
         distributions: "sdist bdist_wheel"
-        on:
-          tags: true
-          repo: blue-yonder/tsfresh
-notifications:
-   slack: tsfresh:uIzPVnlBQs32xE5jbq34f0Cq
+
+  # Some dependencies are not yet ready for Python 3.8
+  allow_failures:
+    - python: 3.8
+  # Make sure to not wait for Python 3.8
+  fast_finish: true
+
diff --git a/tests/integrations/dask-worker-space/global.lock b/tests/integrations/dask-worker-space/global.lock
diff --git a/tests/integrations/dask-worker-space/purge.lock b/tests/integrations/dask-worker-space/purge.lock
diff --git a/tests/integrations/test_just_call.py b/tests/integrations/test_just_call.py
@@ -0,0 +1,19 @@
+from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures
+from tsfresh import extract_features
+from tsfresh.utilities.dataframe_functions import impute
+from tsfresh.feature_extraction import ComprehensiveFCParameters
+from tsfresh.utilities.distribution import ClusterDaskDistributor
+
+
+if __name__ == "__main__":
+    import os
+
+    # Address of the dask scheduler to connect to. The default points to a machine in a private network,
+    # so it can be overridden with the TSFRESH_DASK_SCHEDULER environment variable on other setups.
+    scheduler_address = os.environ.get("TSFRESH_DASK_SCHEDULER", "tcp://192.168.178.235:8786")
+
+    # Fetch the example data set (if necessary) and load it
+    download_robot_execution_failures()
+    df, y = load_robot_execution_failures()
+
+    extraction_settings = ComprehensiveFCParameters()
+
+    # Run the feature extraction on the already running dask cluster
+    X = extract_features(df,
+                         column_id='id', column_sort='time',
+                         default_fc_parameters=extraction_settings,
+                         chunksize=100, distributor=ClusterDaskDistributor(scheduler_address))
+
+    print(X)
diff --git a/tests/units/feature_extraction/test_settings.py b/tests/units/feature_extraction/test_settings.py
@@ -20,7 +20,7 @@ class TestSettingsObject(TestCase):
     def test_from_column_raises_on_wrong_column_format(self):
 
         self.assertRaises(TypeError, from_columns, 42)
-        self.assertRaises(TypeError, from_columns, 42)
+        self.assertRaises(TypeError, from_columns, [42])
         self.assertRaises(ValueError, from_columns, ["This is not a column name"])
         self.assertRaises(ValueError, from_columns, ["This__neither"])
         self.assertRaises(ValueError, from_columns, ["This__also__not"])

diff --git a/tests/units/utilities/test_dataframe_functions.py b/tests/units/utilities/test_dataframe_functions.py
@@ -114,10 +114,18 @@ def test_with_wrong_input(self):
         self.assertRaises(AttributeError, dataframe_functions._normalize_input_to_internal_representation, test_df,
                           "strange_id", "sort", "kind", "value")
 
+        test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "sort": 1}])
+        self.assertRaises(AttributeError, dataframe_functions._normalize_input_to_internal_representation, test_df,
+                          "id", "sort", "strange_kind", "value")
+
         test_df = pd.DataFrame([{"id": np.NaN, "kind": "a", "value": 3, "sort": 1}])
         self.assertRaises(ValueError, dataframe_functions._normalize_input_to_internal_representation, test_df,
                           "id", "sort", "kind", "value")
 
+        test_df = pd.DataFrame([{"id": 0, "kind": np.NaN, "value": 3, "sort": 1}])
+        self.assertRaises(ValueError, dataframe_functions._normalize_input_to_internal_representation, test_df,
+                          "id", "sort", "kind", "value")
+
         test_df = pd.DataFrame([{"id": 2}, {"id": 1}])
         test_dict = {"a": test_df, "b": test_df}
 
@@ -218,6 +226,11 @@ def test_with_wrong_input(self):
                           column_sort="sort", column_kind="kind",
                           rolling_direction=1)
 
+        self.assertRaises(ValueError, dataframe_functions.roll_time_series,
+                          df_or_dict=test_df, column_id=None,
+                          column_sort="sort", column_kind="kind",
+                          rolling_direction=1)
+
         test_df = {"a": pd.DataFrame([{"id": 0}])}
         self.assertRaises(ValueError, dataframe_functions.roll_time_series,
                           df_or_dict=test_df, column_id="id",
@@ -768,3 +781,7 @@ def test_get_id__correct_dict(self):
         df_dict = {"a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}),
                    "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})}
         self.assertEqual(get_ids(df_dict, "id"), {1, 2, 3, 4})
+
+    def test_get_id_wrong(self):
+        # get_ids is expected to raise a TypeError when it is handed a plain numpy array
+        unsupported_input = np.array([1, 2, 3])
+        with self.assertRaises(TypeError):
+            get_ids(unsupported_input, "id")
diff --git a/tests/units/utilities/test_distribution.py b/tests/units/utilities/test_distribution.py
@@ -5,15 +5,16 @@
 from unittest import TestCase
 import numpy as np
 import pandas as pd
+from distributed import LocalCluster, Client
 
 from tsfresh import extract_features
-from tsfresh.utilities.distribution import MultiprocessingDistributor, LocalDaskDistributor
+from tsfresh.utilities.distribution import MultiprocessingDistributor, LocalDaskDistributor, ClusterDaskDistributor
 from tests.fixtures import DataTestCase
 
 
 class MultiprocessingDistributorTestCase(TestCase):
 
-    def test_partion(self):
+    def test_partition(self):
 
         distributor = MultiprocessingDistributor(n_workers=1)
 
@@ -80,3 +81,50 @@ def test_local_dask_cluster_extraction_two_worker(self):
         self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483])))
         self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75])))
         self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0])))
+
+
+class ClusterDaskDistributorTestCase(DataTestCase):
+    """Feature extraction through a ClusterDaskDistributor that connects to a LocalCluster."""
+
+    def _assert_extraction_on_cluster(self, n_workers):
+        """
+        Start a local dask cluster, run the feature extraction on it and check some of the results.
+
+        :param n_workers: number of (single-threaded) workers the local cluster is started with
+        :type n_workers: int
+        """
+        # Known feature values of the test data sample, one entry per id
+        expected_values = {
+            "a__maximum": [71, 77],
+            "a__sum_values": [691, 1017],
+            "a__abs_energy": [32211, 63167],
+            "b__sum_values": [757, 695],
+            "b__minimum": [3, 1],
+            "b__abs_energy": [36619, 35483],
+            "b__mean": [37.85, 34.75],
+            "b__median": [39.5, 28.0],
+        }
+
+        cluster = LocalCluster(n_workers=n_workers, threads_per_worker=1, diagnostics_port=False)
+        try:
+            client = Client(cluster)
+            try:
+                # The distributor opens its own connection, it only needs the address of the scheduler
+                address = client.scheduler_info()['address']
+                distributor = ClusterDaskDistributor(address=address)
+
+                df = self.create_test_data_sample()
+                extracted_features = extract_features(df, column_id="id", column_sort="sort", column_kind="kind",
+                                                      column_value="val",
+                                                      distributor=distributor)
+
+                self.assertIsInstance(extracted_features, pd.DataFrame)
+                for feature_name, expected in expected_values.items():
+                    self.assertTrue(np.all(extracted_features[feature_name] == np.array(expected)))
+            finally:
+                # Close the client of the test itself, even if an assertion failed
+                client.close()
+        finally:
+            # Always shut down the cluster, so that no workers are leaked into the following tests
+            cluster.close()
+
+    def test_dask_cluster_extraction_one_worker(self):
+        self._assert_extraction_on_cluster(n_workers=1)
+
+    def test_dask_cluster_extraction_two_workers(self):
+        self._assert_extraction_on_cluster(n_workers=2)
diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py
@@ -1402,13 +1402,13 @@ def c3(x, lag):
 
     .. math::
 
-        \\frac{1}{n-2lag} \\sum_{i=0}^{n-2lag} x_{i + 2 \\cdot lag}^2 \\cdot x_{i + lag} \\cdot x_{i}
+        \\frac{1}{n-2lag} \\sum_{i=1}^{n-2lag} x_{i + 2 \\cdot lag} \\cdot x_{i + lag} \\cdot x_{i}
 
     which is
 
     .. math::
 
-        \\mathbb{E}[L^2(X)^2 \\cdot L(X) \\cdot X]
+        \\mathbb{E}[L^2(X) \\cdot L(X) \\cdot X]
 
     where :math:`\\mathbb{E}` is the mean and :math:`L` is the lag operator. It was proposed in [1] as a measure of
     non linearity in the time series.

diff --git a/tsfresh/utilities/distribution.py b/tsfresh/utilities/distribution.py
@@ -261,7 +261,7 @@ def distribute(self, func, partitioned_chunks, kwargs):
         """
 
         if isinstance(partitioned_chunks, Iterable):
-            # since dask 2.0.0 client map no longer accepts iteratables
+            # since dask 2.0.0 client map no longer accepts iterables
             partitioned_chunks = list(partitioned_chunks)
         result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks))
         return [item for sublist in result for item in sublist]
@@ -319,7 +319,9 @@ def distribute(self, func, partitioned_chunks, kwargs):
         :return: The result of the calculation as a list - each item should be the result of the application of func
             to a single element.
         """
-
+        if isinstance(partitioned_chunks, Iterable):
+            # since dask 2.0.0 client map no longer accepts iterables
+            partitioned_chunks = list(partitioned_chunks)
         result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks))
         return [item for sublist in result for item in sublist]
-Original file line number
+Diff line change
@@ Expand Up / @@ -17,6 +17,7 @@ __pycache__/* @@
     .pydevproject
     .settings
     .idea
+    .vscode
     # Package files
     *.egg
@@ Expand Down @@