Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into feature/pep8
Browse files Browse the repository at this point in the history
  • Loading branch information
nils-braun committed Dec 3, 2019
2 parents 430f029 + ea61188 commit 28461df
Show file tree
Hide file tree
Showing 9 changed files with 144 additions and 105 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ __pycache__/*
.pydevproject
.settings
.idea
.vscode

# Package files
*.egg
Expand Down
167 changes: 69 additions & 98 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,123 +1,94 @@
# Travis Build file for tsfresh
## Travis Build file for tsfresh

language: python
# We want the pip folder to be cached, to speed up installation
os: linux
notifications:
slack: tsfresh:uIzPVnlBQs32xE5jbq34f0Cq

# We want the pip folder to be cached to speed up installation
cache:
directories:
- $HOME/.cache/pip

# Installation of packages
install:
# Begin by updating pip to its newest version
- pip install --upgrade pip wheel setuptools
# Then install the requirements as they are defined
- pip install -r requirements.txt -r test-requirements.txt
- pip install -U .
- pip freeze
jobs:
include:
- stage: Run tests on newest set of dependencies (Python 3.5.3)
env: NUMPY="latest", PANDAS="latest", SCIKITLEARN="latest", DASK="latest", DISTRIBUTED="latest", SCIPY="latest"
before_script:
- pip install --upgrade numpy
- pip install --upgrade pandas
- pip install --upgrade scikit-learn
- pip install --upgrade dask
- pip install --upgrade distributed
- pip install --upgrade scipy
- pip list
script:
- sed -i 's/\-n auto/\-n 2/g' setup.cfg
- "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi"
python: 3.5.3
# Then install the package
- pip install .
# Now downgrade packages if required by environment variables
- "[ \"$NUMPY\" = latest ] && pip install --upgrade numpy || [ -z \"$NUMPY\" ] || pip install numpy==$NUMPY"
- "[ \"$PANDAS\" = latest ] && pip install --upgrade pandas || [ -z \"$PANDAS\" ] || pip install pandas==$PANDAS"
- "[ \"$SCIKITLEARN\" = latest ] && pip install --upgrade scikit-learn || [ -z \"$SCIKITLEARN\" ] || pip install scikit-learn==$SCIKITLEARN"
- "[ \"$DASK\" = latest ] && pip install --upgrade dask || [ -z \"$DASK\" ] || pip install dask==$DASK"
- "[ \"$DISTRIBUTED\" = latest ] && pip install --upgrade distributed || [ -z \"$DISTRIBUTED\" ] || pip install distributed==$DISTRIBUTED"
# need to downgrade tornado manually
- "[ \"$SCIPY\" = latest ] || [ -z \"$SCIPY\" ] || pip install tornado==4.5.3"
- "[ \"$SCIPY\" = latest ] && pip install --upgrade scipy || [ -z \"$SCIPY\" ] || pip install scipy==$SCIPY"
# Print out the pip versions for debugging
- pip list
# Only use two cores
- sed -i -e 's/-n auto/-n 2/g' setup.cfg
# we want to run coverage only inside a single job
- sed -e '/^\s*--cov tsfresh/d' setup.cfg > setup-nocov.cfg

- stage: Run tests on newest set of dependencies (Python 3.6)
env: NUMPY="latest", PANDAS="latest", SCIKITLEARN="latest", DASK="latest", DISTRIBUTED="latest", SCIPY="latest"
before_script:
- pip install --upgrade numpy
- pip install --upgrade pandas
- pip install --upgrade scikit-learn
- pip install --upgrade dask
- pip install --upgrade distributed
- pip install --upgrade scipy
- pip list
script:
- sed -i 's/\-n auto/\-n 2/g' setup.cfg
- "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi"
python: 3.6
# all jobs call pytest with setup-nocov.cfg to not run coverage test - except one which overrides this variable
env:
global:
- PYTEST_ADDOPTS="-c setup-nocov.cfg"

# The script to call on tests
script: "if [ $TRAVIS_PULL_REQUEST = false ] && ! [ $TRAVIS_BRANCH = master ]; then pytest tests/units; else pytest tests; fi"

# Now define the different stages
jobs:
include:
# First stage: tests
- stage: Run tests
env: NUMPY="latest" PANDAS="latest" SCIKITLEARN="latest" DASK="latest" DISTRIBUTED="latest" SCIPY="latest"
python: 3.8

- stage: Run tests on newest set of dependencies (Python 3.7)
env: NUMPY="latest", PANDAS="latest", SCIKITLEARN="latest", DASK="latest", DISTRIBUTED="latest", SCIPY="latest"
before_script:
- pip install --upgrade numpy
- pip install --upgrade pandas
- pip install --upgrade scikit-learn
- pip install --upgrade dask
- pip install --upgrade distributed
- pip install --upgrade scipy
- pip list
script:
- sed -i 's/\-n auto/\-n 2/g' setup.cfg
- "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi"
- env: NUMPY="latest" PANDAS="latest" SCIKITLEARN="latest" DASK="latest" DISTRIBUTED="latest" SCIPY="latest" PYTEST_ADDOPTS="-c setup.cfg"
python: 3.7
# We only run coverage tests here (we also set a different setup.cfg script here)
after_success:
- coveralls

- stage: Run tests on oldest set of dependencies (Python 3.5.3)
env: NUMPY="1.12.0", PANDAS="0.20.3", SCIKITLEARN="0.19.0", DASK="0.15.2", DISTRIBUTED="1.18.3", SCIPY="1.2.0"
before_script:
- pip install numpy==1.12.0 # First version with official python 3.6 support.
- pip install pandas==0.20.3
- pip install scikit-learn==0.19.0
- pip install dask==0.15.2
- pip install distributed==1.18.3
# need to downgrade tornado manually
- pip install tornado==4.5.3
- pip install scipy==1.2.0
- pip list
script:
- sed -i 's/\-n auto/\-n 2/g' setup.cfg
- "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi"
- env: NUMPY="latest" PANDAS="latest" SCIKITLEARN="latest" DASK="latest" DISTRIBUTED="latest" SCIPY="latest"
python: 3.6

- env: NUMPY="latest" PANDAS="latest" SCIKITLEARN="latest" DASK="latest" DISTRIBUTED="latest" SCIPY="latest"
# newest pandas (>= 0.25) requires python >= 3.5.3
python: 3.5.3

- stage: Run tests on oldest set of dependencies (Python 3.6)
env: NUMPY="1.12.0", PANDAS="0.20.3", SCIKITLEARN="0.19.0", DASK="0.15.2", DISTRIBUTED="1.18.3", SCIPY="1.2.0"
before_script:
- pip install numpy==1.12.0 # First version with official python 3.6 support.
- pip install pandas==0.20.3
- pip install scikit-learn==0.19.0
- pip install dask==0.15.2
- pip install distributed==1.18.3
# need to downgrade tornado manually
- pip install tornado==4.5.3
- pip install scipy==1.2.0
- pip list
script:
- sed -i 's/\-n auto/\-n 2/g' setup.cfg
- "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi"
- env: NUMPY="1.15.1" PANDAS="0.23.2" SCIKITLEARN="0.19.2" DASK="0.16.1" DISTRIBUTED="1.18.3" SCIPY="1.2.0"
# python 3.7 requires pandas >= 0.23.2
python: 3.7

- env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0"
python: 3.6

- stage: Run tests on oldest set of dependencies (Python 3.7)
env: NUMPY="1.15.1", PANDAS="0.23.2", SCIKITLEARN="0.19.2", DASK="0.16.1", DISTRIBUTED="1.18.3", SCIPY="1.2.0"
before_script:
- pip install numpy==1.15.1 # First version with official python 3.7 support.
- pip install pandas==0.23.2 # First version with official python 3.7 support.
- pip install scikit-learn==0.19.2 # First version with python 3.7 support.
- pip install dask==0.16.1
- pip install distributed==1.18.3
# need to downgrade tornado manually
- pip install tornado==4.5.3
- pip install scipy==1.2.0
- pip list
script:
- sed -i 's/\-n auto/\-n 2/g' setup.cfg
- "if [ $TRAVIS_PULL_REQUEST == false ] && ! [ $TRAVIS_BRANCH == 'master' ]; then pytest tests/units -n2; else pytest tests; fi"
python: 3.7
- env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0"
python: 3.5.3


# Second stage: deploy
- stage: Deploy
if: tag IS present AND fork = false
python: 3.7
script: skip
deploy:
provider: pypi
user: MaxChrist
password:
secure: Jh0Z69Mh+esOpegXyXoecFOkpMhaQaiJbQVEvVvQ2K1rCmCE20a19/TGfPUrynpqOYXZvvb5Ok6CtlzAi9J5huA3MRSf4iYPsUe8i7n0FK4JU5BP7VqM3/7cQZMdD5SeYFV3e3JURDcKYfoG7N+DNb+LfluYK5MBkRLhdEVRqSeHocPY4QRzzhJi1ljX99ThdRPrsQqYaD3tpZxJhbJDgHLtvMr39+407uQSDnubvFz3iu90DZiN2fIP5bEN6PDuaGXNZMA1p40DjSkGc7epg0U4vHn6CSya1nXlqjXUqXYJY5Ha2kbMAN7hfmU+gId09+FSHQRuanKJkRqSBksVgATCAeSAiqAe3EPAsG75ewhXDeusQZMzRy7DxQzjOJG9oIyWMVmZFlIoNlpg2eifN9uUc7FfyGHiVfWwUDslszpc/81hQViMPP0NoMAop4zcWR3ChCMnHMycPQEmWuV65WfL7yN6SuTokxSmepubPtFs+4UIlI0rgZWCHVIgGZqI8LFn958pLtpQ+32Ew8HGU3IiOfao9HbGreQ2Lgqo2L2EyNDWiHfJ3oZ1+6BP/1GqI6j7x7oPdwoE1jvY4CSC7iMAiieZNnrvywvmJpZB69CGefxQJzWcm+yD03QwNBFFaabCbKwbn+q3eUOUrPRuvTkhVLRWDxQNH/zaZyuZQ+Q=
distributions: "sdist bdist_wheel"
on:
tags: true
repo: blue-yonder/tsfresh
notifications:
slack: tsfresh:uIzPVnlBQs32xE5jbq34f0Cq

# Some dependencies are not yet ready for Python 3.8
allow_failures:
- python: 3.8
# Make sure to not wait for Python 3.8
fast_finish: true

Empty file.
Empty file.
2 changes: 1 addition & 1 deletion tests/units/feature_extraction/test_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class TestSettingsObject(TestCase):
def test_from_column_raises_on_wrong_column_format(self):

self.assertRaises(TypeError, from_columns, 42)
self.assertRaises(TypeError, from_columns, 42)
self.assertRaises(TypeError, from_columns, [42])
self.assertRaises(ValueError, from_columns, ["This is not a column name"])
self.assertRaises(ValueError, from_columns, ["This__neither"])
self.assertRaises(ValueError, from_columns, ["This__also__not"])
Expand Down
17 changes: 17 additions & 0 deletions tests/units/utilities/test_dataframe_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,18 @@ def test_with_wrong_input(self):
self.assertRaises(AttributeError, dataframe_functions._normalize_input_to_internal_representation, test_df,
"strange_id", "sort", "kind", "value")

test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "sort": 1}])
self.assertRaises(AttributeError, dataframe_functions._normalize_input_to_internal_representation, test_df,
"id", "sort", "strange_kind", "value")

test_df = pd.DataFrame([{"id": np.NaN, "kind": "a", "value": 3, "sort": 1}])
self.assertRaises(ValueError, dataframe_functions._normalize_input_to_internal_representation, test_df,
"id", "sort", "kind", "value")

test_df = pd.DataFrame([{"id": 0, "kind": np.NaN, "value": 3, "sort": 1}])
self.assertRaises(ValueError, dataframe_functions._normalize_input_to_internal_representation, test_df,
"id", "sort", "kind", "value")

test_df = pd.DataFrame([{"id": 2}, {"id": 1}])
test_dict = {"a": test_df, "b": test_df}

Expand Down Expand Up @@ -218,6 +226,11 @@ def test_with_wrong_input(self):
column_sort="sort", column_kind="kind",
rolling_direction=1)

self.assertRaises(ValueError, dataframe_functions.roll_time_series,
df_or_dict=test_df, column_id=None,
column_sort="sort", column_kind="kind",
rolling_direction=1)

test_df = {"a": pd.DataFrame([{"id": 0}])}
self.assertRaises(ValueError, dataframe_functions.roll_time_series,
df_or_dict=test_df, column_id="id",
Expand Down Expand Up @@ -768,3 +781,7 @@ def test_get_id__correct_dict(self):
df_dict = {"a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}),
"b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})}
self.assertEqual(get_ids(df_dict, "id"), {1, 2, 3, 4})

def test_get_id_wrong(self):
    """get_ids must raise a TypeError when it is handed a plain numpy array."""
    unsupported_input = np.array([1, 2, 3])

    with self.assertRaises(TypeError):
        get_ids(unsupported_input, "id")
52 changes: 50 additions & 2 deletions tests/units/utilities/test_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,16 @@
from unittest import TestCase
import numpy as np
import pandas as pd
from distributed import LocalCluster, Client

from tsfresh import extract_features
from tsfresh.utilities.distribution import MultiprocessingDistributor, LocalDaskDistributor
from tsfresh.utilities.distribution import MultiprocessingDistributor, LocalDaskDistributor, ClusterDaskDistributor
from tests.fixtures import DataTestCase


class MultiprocessingDistributorTestCase(TestCase):

def test_partion(self):
def test_partition(self):

distributor = MultiprocessingDistributor(n_workers=1)

Expand Down Expand Up @@ -80,3 +81,50 @@ def test_local_dask_cluster_extraction_two_worker(self):
self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483])))
self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75])))
self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0])))


class ClusterDaskDistributorTestCase(DataTestCase):
    """Run feature extraction through a ClusterDaskDistributor attached to a local dask cluster."""

    def _assert_extraction_on_cluster(self, n_workers):
        """
        Start a LocalCluster, extract features through a ClusterDaskDistributor and check the result.

        The cluster and the helper client are always shut down, even if the extraction raises, so that a
        failing run does not leave scheduler or worker processes behind for the following tests.

        :param n_workers: number of single-threaded workers the local cluster is started with
        :type n_workers: int
        """
        cluster = LocalCluster(n_workers=n_workers, threads_per_worker=1, diagnostics_port=False)
        try:
            # This client is only needed to look up the scheduler address the distributor connects to.
            client = Client(cluster)
            try:
                address = client.scheduler_info()['address']
                distributor = ClusterDaskDistributor(address=address)

                df = self.create_test_data_sample()
                extracted_features = extract_features(df, column_id="id", column_sort="sort",
                                                      column_kind="kind", column_value="val",
                                                      distributor=distributor)
            finally:
                client.close()
        finally:
            cluster.close()

        self.assertIsInstance(extracted_features, pd.DataFrame)

        expected_columns = {
            "a__maximum": [71, 77],
            "a__sum_values": [691, 1017],
            "a__abs_energy": [32211, 63167],
            "b__sum_values": [757, 695],
            "b__minimum": [3, 1],
            "b__abs_energy": [36619, 35483],
            "b__mean": [37.85, 34.75],
            "b__median": [39.5, 28.0],
        }
        for column_name, expected_values in expected_columns.items():
            self.assertTrue(np.all(extracted_features[column_name] == np.array(expected_values)),
                            msg="unexpected values in column {}".format(column_name))

    def test_dask_cluster_extraction_one_worker(self):
        self._assert_extraction_on_cluster(n_workers=1)

    def test_dask_cluster_extraction_two_workers(self):
        self._assert_extraction_on_cluster(n_workers=2)
4 changes: 2 additions & 2 deletions tsfresh/feature_extraction/feature_calculators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1402,13 +1402,13 @@ def c3(x, lag):
.. math::
\\frac{1}{n-2lag} \\sum_{i=0}^{n-2lag} x_{i + 2 \\cdot lag}^2 \\cdot x_{i + lag} \\cdot x_{i}
\\frac{1}{n-2lag} \\sum_{i=1}^{n-2lag} x_{i + 2 \\cdot lag} \\cdot x_{i + lag} \\cdot x_{i}
which is
.. math::
\\mathbb{E}[L^2(X)^2 \\cdot L(X) \\cdot X]
\\mathbb{E}[L^2(X) \\cdot L(X) \\cdot X]
where :math:`\\mathbb{E}` is the mean and :math:`L` is the lag operator. It was proposed in [1] as a measure of
non linearity in the time series.
Expand Down
6 changes: 4 additions & 2 deletions tsfresh/utilities/distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def distribute(self, func, partitioned_chunks, kwargs):
"""

if isinstance(partitioned_chunks, Iterable):
# since dask 2.0.0 client map no longer accepts iteratables
# since dask 2.0.0 client map no longer accepts iterables
partitioned_chunks = list(partitioned_chunks)
result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks))
return [item for sublist in result for item in sublist]
Expand Down Expand Up @@ -319,7 +319,9 @@ def distribute(self, func, partitioned_chunks, kwargs):
:return: The result of the calculation as a list - each item should be the result of the application of func
to a single element.
"""

if isinstance(partitioned_chunks, Iterable):
# since dask 2.0.0 client map no longer accepts iterables
partitioned_chunks = list(partitioned_chunks)
result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks))
return [item for sublist in result for item in sublist]

Expand Down

0 comments on commit 28461df

Please sign in to comment.