diff --git a/.travis.yml b/.travis.yml index f0f2d5c9..d194fc53 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,5 @@ # Config file for automatic testing at travis-ci.org +dist: xenial language: python python: - 3.6 diff --git a/HISTORY.md b/HISTORY.md index 0735309b..6c522f7d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,40 @@ # History +## 0.2.0 + +### New Features + +* Publish the pipelines as an `entry_point` +[Issue #175](https://github.com/HDI-Project/MLPrimitives/issues/175) by @csala + +### Primitive Improvements + +* Improve pandas.DataFrame.resample primitive [Issue #177](https://github.com/HDI-Project/MLPrimitives/issues/177) by @csala +* Improve `feature_extractor` primitives [Issue #183](https://github.com/HDI-Project/MLPrimitives/issues/183) by @csala +* Improve `find_anomalies` primitive [Issue #180](https://github.com/HDI-Project/MLPrimitives/issues/180) by @AlexanderGeiger + +### Bug Fixes + +* Typo in the primitive keras.Sequential.LSTMTimeSeriesRegressor [Issue #176](https://github.com/HDI-Project/MLPrimitives/issues/176) by @DanielCalvoCerezo + + +## 0.1.10 + +### New Features + +* Add function to run primitives without a pipeline [Issue #43](https://github.com/HDI-Project/MLPrimitives/issues/43) by @csala + +### New Pipelines + +* Add pipelines for all the MLBlocks examples [Issue #162](https://github.com/HDI-Project/MLPrimitives/issues/162) by @csala + +### Primitive Improvements + +* Add Early Stopping to `keras.Sequential.LSTMTimeSeriesRegressor` primitive [Issue #156](https://github.com/HDI-Project/MLPrimitives/issues/156) by @csala +* Make FeatureExtractor primitives accept Numpy arrays [Issue #165](https://github.com/HDI-Project/MLPrimitives/issues/165) by @csala +* Add window size and pruning to the `timeseries_anomalies.find_anomalies` primitive [Issue #160](https://github.com/HDI-Project/MLPrimitives/issues/160) by @csala + + ## 0.1.9 ### New Features diff --git a/MANIFEST.in b/MANIFEST.in index afa9820c..e524e581 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,7 +3,8 @@ include CONTRIBUTING.rst include HISTORY.md include LICENSE include README.md -include mlprimitives/jsons/*.json + +recursive-include mlprimitives *.json recursive-include tests * recursive-exclude * __pycache__ diff --git a/Makefile b/Makefile index 081e80a6..a826c3c7 100644 --- a/Makefile +++ b/Makefile @@ -88,8 +88,8 @@ install-develop: clean-build clean-pyc ## install the package in editable mode a lint: ## check style with flake8 and isort flake8 mlprimitives tests isort -c --recursive mlprimitives tests - find pipelines -name '*.json' | xargs -n1 -I{} bash -c "diff -q {} <(python -m json.tool {})" - find mlprimitives/jsons -name '*.json' | xargs -n1 -I{} bash -c "diff -q {} <(python -m json.tool {})" + find mlprimitives/pipelines -name '*.json' | xargs -n1 -I{} bash -c "diff -q {} <(python -m json.tool {})" + find mlprimitives/primitives -name '*.json' | xargs -n1 -I{} bash -c "diff -q {} <(python -m json.tool {})" .PHONY: fix-lint fix-lint: ## fix lint issues using autoflake, autopep8, and isort @@ -101,8 +101,8 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort autopep8 --in-place --recursive --aggressive tests isort --apply --atomic --recursive tests - find pipelines -name '*.json' | xargs -n1 -I{} bash -c "python -m json.tool {} {}.tmp && mv {}.tmp {}" - find mlprimitives/jsons -name '*.json' | xargs -n1 -I{} bash -c "python -m json.tool {} {}.tmp && mv {}.tmp {}" + find mlprimitives/pipelines -name '*.json' | xargs -n1 -I{} bash -c "python -m json.tool 
{} {}.tmp && mv {}.tmp {}" + find mlprimitives/primitives -name '*.json' | xargs -n1 -I{} bash -c "python -m json.tool {} {}.tmp && mv {}.tmp {}" # TEST TARGETS @@ -117,7 +117,7 @@ test-all: ## run tests on every Python version with tox .PHONY: test-pipelines test-pipelines: ## Test all the pipelines from the pipelines folder - mlprimitives test pipelines/*.json + mlprimitives test mlprimitives/pipelines/*.json .PHONY: coverage coverage: ## check code coverage quickly with the default Python diff --git a/README.md b/README.md index 29df2732..b574d4bb 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

-“DAI-Lab” +“DAI-Lab” An open source project from Data to AI Lab at MIT.

@@ -11,111 +11,351 @@
 # MLPrimitives
-MLBlocks Primitives
+Pipelines and primitives for machine learning and data science.
- Free software: MIT license
- Documentation: https://HDI-Project.github.io/MLPrimitives
+# Overview
-## Overview
+This repository contains primitive annotations to be used by the MLBlocks library, as well as
+the necessary Python code to make some of them fully compatible with the MLBlocks API requirements.
-This repository contains JSON primitives to be used by the MLBlocks library, as well as the
-necessary Python code to make some of them fully compatible with the MLBlocks API requirements.
+There is also a collection of custom primitives contributed directly to this library, which either
+combine third party tools or implement new functionalities from scratch.
-There is also a collection of custom primitives contributed directly to this library, which
-either combine third party tools or implement new functionalities from scratch.
+## Why did we create this library?
+* Too many libraries in a fast growing field
+* Huge societal need to build machine learning apps
+* Domain expertise resides at several places (knowledge of math)
+* No documented information about hyperparameters, behavior...
-## Project Structure
+# Installation
-The project is divided in three parts:
+## Requirements
-### The `mlprimitives` package
+**MLPrimitives** has been developed and tested on [Python 3.5 and 3.6](https://www.python.org/downloads/)
-The mlprimitives folder is where all the Python code can be found.
+Also, although it is not strictly required, the usage of a
+[virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid
+interfering with other software installed in the system where **MLPrimitives** is run.
-Several sub-modules exist inside it, for the different types of primitives implemented, including
-the `mlprimitives.adapters` module, which has a special role in the integration of third
-party tools that do not directly fit the MLBlocks requirements.
+These are the minimum commands needed to create a virtualenv using python3.6 for **MLPrimitives**:
-### The `mlprimitives/jsons` folder
+```bash
+pip install virtualenv
+virtualenv -p $(which python3.6) mlprimitives-venv
+```
-The `mlprimitives/jsons` folder contains the JSON annotations for the primitives.
+Afterwards, you have to execute this command to have the virtualenv activated:
-This folder has a flat structure, without subfolders, and all the primitive JSONs are named
-after the Fully Qualified Name of the annotated primitive (function or class).
+```bash
+source mlprimitives-venv/bin/activate
+```
-As a result of this, sorting the JSON files alphabetically shows them grouped by library, which
-makes browsing them and seeing what tools are implemented easy.
+Remember to execute it every time you start a new console to work on **MLPrimitives**!
-### The `tests` folder
+## Install using Pip
-Here are the unit tests for the Python code, as well as some validation tests for the JSON
-annotations.
+After creating the virtualenv and activating it, we recommend using
+[pip](https://pip.pypa.io/en/stable/) in order to install **MLPrimitives**:
+```bash
+pip install mlprimitives
+```
-## Primitive Types
+This will pull and install the latest stable release from [PyPI](https://pypi.org/).
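+
+To verify that the installation was successful, you can import the package and print the
+installed version, which is exposed as the `__version__` attribute defined in
+`mlprimitives/__init__.py`:
+
+```python
+import mlprimitives
+
+print(mlprimitives.__version__)
+```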
-Three types of primitives can be found in this repository:
+## Install from Source
-### Primitives that can be directly integrated to MLBlocks
+Alternatively, with your virtualenv activated, you can clone the repository and install it from
+source by running `make install` on the `stable` branch:
-The simplest type of primitives are the ones that can be directly integrated to MLBlocks
-using nothing else than a single JSON annotation file.
+```bash
+git clone git@github.com:HDI-Project/MLPrimitives.git
+cd MLPrimitives
+git checkout stable
+make install
+```
-These JSON files can be found in the `mlblocks_primitives` folder, and integrate functions
-or classes that comply with the following requirements:
+## Install for Development
-* Tunable hyperparameters are simple values of the supported basic types: str, bool, int or float.
-* Creating the class instance or calling the fit or produce methods does not require building
-  any complex structure before the call is made.
-* The fitting and predicting phase consist on a single method or function call each.
+If you want to contribute to the project, a few more steps are required to make the project ready
+for development.
-A good example of this type of primitives are most of the estimators from the scikit-learn
-library.
+First, please head to [the GitHub page of the project](https://github.com/HDI-Project/MLPrimitives)
+and make a fork of the project under your own username by clicking on the **fork** button on the
+upper right corner of the page.
-### Primitives that need a Python adapter to be integrated to MLBlocks
+Afterwards, clone your fork and create a branch from master with a descriptive name that includes
+the number of the issue that you are going to work on:
-The second simplest type of primitives are the ones that need some kind of adaptation process
-to be integrated to MLBlocks, but whose behaviour is not altered in any way by this process.
+```bash
+git clone git@github.com:{your username}/MLPrimitives.git
+cd MLPrimitives
+git branch issue-xx-cool-new-feature master
+git checkout issue-xx-cool-new-feature
+```
-These primitives consist of some Python code which can be found in the `mlprimitives.adapters`
-module, as well as JSON annotations that point at the corresponding functions or classes,
-which can be found in the `mlblocs_primitives` folder.
+Finally, install the project with the following command, which will install some additional
+dependencies for code linting and testing.
-The type of primitives that are integrated in this way are the ones that have some of these
-characteristics:
+```bash
+make install-develop
+```
-* Need some additional steps after the instantiation in order to be prepared to run.
-* The tunable hyperparameters need some kind of transformation or instantiation before they can
-  be passed to the primitive.
-* The primitive cannot be directly applied to the inputs or the outputs need to be manipulated in
-  some way before they can be passed to any other primitive.
+Make sure to run the commands `make lint` and `make test` regularly while developing.
-Some examples of these primitives are the Keras models, which need to be built in several steps
-and later on compiled before they can be used, or some image transformation primitives which
-need to be applied to the images one by one.
+# Quickstart
-### Custom primitives
+This section is a short series of tutorials to help you get started with MLPrimitives.
-The third type are custom primitives implemented specifically for this library.
+In the following steps you will learn how to load and run a primitive on some data.
-These custom primitives may be using third party tools or implemented from scratch, but if they
-use third party tools they alter in some way their native behavior to add new functionalities
-to them.
+Later on you will learn how to evaluate and improve the performance of a primitive by tuning
+its hyperparameters.
-This type of primitives consist of Python code from the `mlprimitives` module, as well as the
-corresponding JSON annotations, which can also be found in the `mlblocks_primitives` folder.
+## Running a Primitive
+In this first tutorial, we will be executing a single primitive for data transformation.
-## Contributing
+### 1. Load a Primitive
-This is a community driven project and all contributions are more than welcome, from simple
-feedback to the most complex coding contributions.
+The first step in order to run a primitive is to load it.
-If you have anything that you want to ask, request or contribute, please check the
-[contributing section in the documentation][contributing-docs], and do not hesitate
-to open [GitHub Issue](https://github.com/HDI-Project/MLPrimitives/issues), even if it is
-to ask a simple question.
+This will be done using the `mlprimitives.load_primitive` function, which will
+load the indicated primitive as an [MLBlock Object from MLBlocks](https://hdi-project.github.io/MLBlocks/api/mlblocks.html#mlblocks.MLBlock).
-[contributing-docs]: https://hdi-project.github.io/MLPrimitives/contributing.html
+In this case, we will load the `mlprimitives.custom.feature_extraction.CategoricalEncoder`
+primitive.
+
+```python
+from mlprimitives import load_primitive
+
+primitive = load_primitive('mlprimitives.custom.feature_extraction.CategoricalEncoder')
+```
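+
+As an illustrative aside, `load_primitive` also accepts an optional `arguments` dictionary
+that is passed on to the `MLBlock` when it is created, as can be seen in the
+`mlprimitives/__init__.py` change further down in this diff. Assuming that `__init__`
+arguments such as `max_labels` can be set this way, the same primitive could be loaded
+with custom arguments like this:
+
+```python
+# hypothetical usage: max_labels is an __init__ argument of CategoricalEncoder
+primitive = load_primitive(
+    'mlprimitives.custom.feature_extraction.CategoricalEncoder',
+    arguments={'max_labels': 10},
+)
+```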
+
+### 2. Load some data
+
+The CategoricalEncoder is a transformation primitive which applies one-hot encoding to all the
+categorical columns of a `pandas.DataFrame`.
+
+So, in order to be able to run our primitive, we will first load some data that contains
+categorical columns.
+
+This can be done with the `mlprimitives.datasets.load_census` function:
+
+```python
+from mlprimitives.datasets import load_census
+
+dataset = load_census()
+```
+
+This dataset object has an attribute `data` which contains a table with several categorical
+columns.
+
+We can have a look at this table by executing `dataset.data.head()`, which will return a
+table like this:
+
+```python
+                            0                   1                  2
+age                        39                  50                 38
+workclass           State-gov    Self-emp-not-inc            Private
+fnlwgt                  77516               83311             215646
+education           Bachelors           Bachelors            HS-grad
+education-num              13                  13                  9
+marital-status  Never-married  Married-civ-spouse           Divorced
+occupation       Adm-clerical     Exec-managerial  Handlers-cleaners
+relationship    Not-in-family             Husband      Not-in-family
+race                    White               White              White
+sex                      Male                Male               Male
+capital-gain             2174                   0                  0
+capital-loss                0                   0                  0
+hours-per-week             40                  13                 40
+native-country  United-States       United-States      United-States
+```
+
+### 3. Fit the primitive
+
+In order to run our primitive, we first need to fit it.
+
+This is the process where it analyzes the data to detect which columns are categorical.
+
+This is done by calling its `fit` method and passing the `dataset.data` as `X`.
+
+```python
+primitive.fit(X=dataset.data)
+```
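+
+The `fit` call does not return anything; the detected columns are stored inside the
+primitive. Purely as an illustrative peek, this relies on the `MLBlock.instance` attribute
+and on the private `_features` attribute of the `FeatureExtractor` base class shown further
+down in this diff, so it should not be relied upon in real code:
+
+```python
+# illustrative only: inspect which columns were detected as categorical
+primitive.instance._features
+```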
+
+### 4. Produce results
+
+Once the primitive is fitted, we can process the data by calling the `produce` method of the
+primitive instance and passing again the `data` as `X`.
+
+```python
+transformed = primitive.produce(X=dataset.data)
+```
+
+After this is done, we can see how the transformed data contains the newly generated
+one-hot vectors:
+
+```
+                              0      1       2       3       4
+age                          39     50      38      53      28
+fnlwgt                    77516  83311  215646  234721  338409
+education-num                13     13       9       7      13
+capital-gain               2174      0       0       0       0
+capital-loss                  0      0       0       0       0
+hours-per-week               40     13      40      40      40
+workclass= Private            0      0       1       1       1
+workclass= Self-emp-not-inc   0      1       0       0       0
+workclass= Local-gov          0      0       0       0       0
+workclass= ?                  0      0       0       0       0
+workclass= State-gov          1      0       0       0       0
+workclass= Self-emp-inc       0      0       0       0       0
+...                         ...    ...     ...     ...     ...
+```
+
+## Tuning a Primitive
+
+In this short tutorial we will teach you how to evaluate the performance of a primitive
+and improve it by modifying its hyperparameters.
+
+To do so, we will load a primitive that can learn from the transformed data that we just
+generated and later on make predictions based on new data.
+
+### 1. Load another primitive
+
+First of all, we will load the `xgboost.XGBClassifier` primitive that we will use afterwards.
+
+```python
+primitive = load_primitive('xgboost.XGBClassifier')
+```
+
+### 2. Split the dataset
+
+Before being able to evaluate the primitive performance, we need to split the data into two
+parts: train, which will be used for the primitive to learn, and test, which will be used
+to make the predictions that later on will be evaluated.
+
+In order to do this, we will get the first 75% of rows from the transformed data that we
+obtained above and call it `X_train`, and then set the next 25% of rows as `X_test`.
+
+```python
+train_size = int(len(transformed) * 0.75)
+X_train = transformed.iloc[:train_size]
+X_test = transformed.iloc[train_size:]
+```
+
+Similarly, we need to obtain the `y_train` and `y_test` variables containing the corresponding
+output values.
+
+```python
+y_train = dataset.target[:train_size]
+y_test = dataset.target[train_size:]
+```
+
+### 3. Fit the new primitive
+
+Once we have split the data, we can fit the primitive by passing `X_train` and `y_train`
+to its `fit` method.
+
+```python
+primitive.fit(X=X_train, y=y_train)
+```
+
+### 4. Make predictions
+
+Once the primitive has been fitted, we can produce predictions using the `X_test` data as input.
+
+```python
+predictions = primitive.produce(data=X_test)
+```
+
+### 5. Evaluate the performance
+
+We can now evaluate how good the predictions from our primitive are by using the `score`
+method from the `dataset` object on both the expected output and the real output from the
+primitive:
+
+```python
+dataset.score(y_test, predictions)
+```
+
+This will output a float value between 0 and 1 indicating how good the predictions are, where
+0 is the worst possible score and 1 the best one.
+
+In this case we will obtain a score of around 0.866.
+
+### 6. Set new hyperparameter values
+
+In order to improve the performance of our primitive we will try to modify a couple of its
+hyperparameters.
+
+First we will see which hyperparameter values the primitive has by calling its
+`get_hyperparameters` method.
+
+```python
+primitive.get_hyperparameters()
+```
+
+which will return a dictionary like this:
+
+```python
+{
+    "n_jobs": -1,
+    "n_estimators": 100,
+    "max_depth": 3,
+    "learning_rate": 0.1,
+    "gamma": 0,
+    "min_child_weight": 1
+}
+```
+
+Next, we will see which values are valid for each one of those hyperparameters by calling its
+`get_tunable_hyperparameters` method:
+
+```python
+primitive.get_tunable_hyperparameters()
+```
+
+For example, we will see that the `max_depth` hyperparameter has the following specification:
+
+```python
+{
+    "type": "int",
+    "default": 3,
+    "range": [
+        3,
+        10
+    ]
+}
+```
+
+Next, we will choose a valid value, for example 7, and set it on the primitive using the
+`set_hyperparameters` method:
+
+```python
+primitive.set_hyperparameters({'max_depth': 7})
+```
+
+### 7. Re-evaluate the performance
+
+Once the new hyperparameter value has been set, we repeat the fit/produce/score cycle to
+evaluate the performance with this new hyperparameter value:
+
+```python
+primitive.fit(X=X_train, y=y_train)
+predictions = primitive.produce(data=X_test)
+dataset.score(y_test, predictions)
+```
+
+This time we should see how the score changes, in this case to a value of around 0.724.
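+
+Hyperparameter tuning libraries such as [BTB](https://github.com/HDI-Project/BTB) can automate
+this search. As a minimal hand-rolled sketch that only uses the calls shown above (the
+`[3, 10]` range is the one reported by `get_tunable_hyperparameters`):
+
+```python
+# illustrative sketch: sweep the reported max_depth range by hand
+best_score, best_depth = 0, None
+for max_depth in range(3, 11):
+    primitive.set_hyperparameters({'max_depth': max_depth})
+    primitive.fit(X=X_train, y=y_train)
+    predictions = primitive.produce(data=X_test)
+    score = dataset.score(y_test, predictions)
+    if score > best_score:
+        best_score, best_depth = score, max_depth
+```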
+
+## What's Next?
+
+Do you want to [learn more about the project](https://hdi-project.github.io/MLPrimitives/getting_started/concepts.html),
+about [how to contribute to it](https://hdi-project.github.io/MLPrimitives/community/contributing.html)
+or browse the [API Reference](https://hdi-project.github.io/MLPrimitives/api/mlprimitives.html)?
+Please check the corresponding sections of the [documentation](https://hdi-project.github.io/MLPrimitives/)!
diff --git a/docs/community/adapters.rst b/docs/community/adapters.rst
index a962a8d4..2313b206 100644
--- a/docs/community/adapters.rst
+++ b/docs/community/adapters.rst
@@ -55,7 +55,7 @@ If you want to create a new adapter, please follow these steps:
    ``contributors`` list!
 4. Add a pipeline annotation that uses your primitive inside the pipelines folder, named
    exactly like your primitive, and test it with the command
-   ``mlprimitives test pipelines/your.pipeline.json``.
+   ``mlprimitives test mlprimitives/pipelines/your.pipeline.json``.
    If adding a pipeline is not possible for any reason, please inform the maintainers, as
    this probably means that a new dataset needs to be added.
@@ -109,7 +109,7 @@ and the existing adapter can be safely modified, do the following steps:
    ``contributors`` list!
 4. Add a pipeline annotation that uses your primitive inside the pipelines folder, named
    exactly like your primitive, and test it with the command
-   ``mlprimitives test pipelines/your.pipeline.json``.
+   ``mlprimitives test mlprimitives/pipelines/your.pipeline.json``.
    If adding a pipeline is not possible for any reason, please inform the maintainers, as
    this probably means that a new dataset needs to be added.
 5. Make sure that all the primitives that existed before that use the same adapter still
diff --git a/docs/community/annotations.rst b/docs/community/annotations.rst
index 4517e5e6..1c0b0cb4 100644
--- a/docs/community/annotations.rst
+++ b/docs/community/annotations.rst
@@ -38,7 +38,7 @@ In this case, please follow these steps:
    source code. And don't forget to add you name and e-mail address to the ``contributors`` list!
 4. Add a pipeline annotation that uses your primitive inside the pipelines folder, named
    exactly like your primitive, and test it with the command
-   ``mlprimitives test pipelines/your.pipeline.json``.
+ ``mlprimitives test mlprimitives/pipelines/your.pipeline.json``. If adding a pipeline is not possible for any reason, please inform the maintainers, as this probably means that a new dataset needs to be added. @@ -73,7 +73,7 @@ In this case, please follow these steps: title, as this makes keeping track of the history of the project easier in the long run. Don't forget to add you name and e-mail address to the ``contributors`` list while you are at it! 5. Make sure that the annotation still works by testing the corresponding pipeline. Normally, - this can be done by running the command ``mlprimitives test pipelines/your.pipeline.json``. + this can be done by running the command ``mlprimitives test mlprimitives/pipelines/your.pipeline.json``. 6. Review your changes and make sure that everything continues to work properly by executing the ``make test-all`` command. 7. Push all your changes to GitHub and open a Pull Request, indicating in the description which @@ -118,7 +118,7 @@ In this case, please follow these steps: add you name and e-mail address to the ``contributors`` list while you are at it! 4. Add a pipeline annotation that uses your primitive inside the pipelines folder, named exactly like your primitive, and test it with the command - ``mlprimitives test pipelines/your.pipeline.json``. + ``mlprimitives test mlprimitives/pipelines/your.pipeline.json``. If adding a pipeline is not possible for any reason, please inform the maintainers, as this probably means that a new dataset needs to be added. diff --git a/docs/community/contributing.rst b/docs/community/contributing.rst index a3bf65dd..5ade8534 100644 --- a/docs/community/contributing.rst +++ b/docs/community/contributing.rst @@ -6,7 +6,7 @@ Contributing Guidelines Ready to contribute with your own code? Great! Before diving deeper into the contributing guidelines, please make sure to having read -the :ref:`concepts` section and to have gone through the :ref:`development` guide. +the :ref:`concepts` section and to have installed the project and its development dependencies. Afterwards, please make sure to read the following contributing guidelines carefully, and later on head to the step-by-step guides for each possible type of contribution. @@ -47,10 +47,10 @@ When doing so, make sure to follow these guidelines: $ make coverage # Get the coverage report 6. If you are developing new primitives that can work as part of a Pipeline, please also - add a demo pipeline inside the ``pipelines`` folder and validate that it is running - properly with the command:: + add a demo pipeline inside the ``mlprimitives/pipelines`` folder and validate that it is + running properly with the command:: - $ mlprimitives test pipelines/the_file_of_your_pipeline.json + $ mlprimitives test mlprimitives/pipelines/the_file_of_your_pipeline.json 7. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: diff --git a/docs/community/custom.rst b/docs/community/custom.rst index db98b664..edd9817f 100644 --- a/docs/community/custom.rst +++ b/docs/community/custom.rst @@ -52,7 +52,7 @@ If you want to create a new custom primitive, please follow these steps: e-mail address to the ``contributors`` list! 3. Add a pipeline annotation that uses your primitive inside the pipelines folder, named exactly like your primitive, and test it with the command - ``mlprimitives test pipelines/mlprimitives.candidates.your_module.YourPrimitive.json``. 
+ ``mlprimitives test mlprimitives/pipelines/mlprimitives.candidates.your_module.YourPrimitive.json``. If adding a pipeline is not possible for any reason, please inform the maintainers, as this probably means that a new dataset needs to be added. @@ -104,7 +104,7 @@ and that the existing primitive can be safely modified, do the following steps: 3. If you are creating a new annotation, also add a pipeline annotation that uses your primitive inside the pipelines folder, named exactly like your primitive, and test it with the command - ``mlprimitives test pipelines/mlprimitives.candidates.your_module.YourPrimitive.json``. + ``mlprimitives test mlprimitives/pipelines/mlprimitives.candidates.your_module.YourPrimitive.json``. If adding a pipeline is not possible for any reason, please inform the maintainers, as this probably means that a new dataset needs to be added. 4. Make sure that all the annotations that existed before that use the same primitive still diff --git a/docs/conf.py b/docs/conf.py index 5296ad3f..767be03e 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -19,8 +19,6 @@ # absolute, like shown here. import sphinx_rtd_theme # For read the docs theme -from recommonmark.parser import CommonMarkParser -# from recommonmark.transform import AutoStructify import mlprimitives @@ -38,10 +36,8 @@ 'sphinx.ext.githubpages', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon', - # 'sphinx.ext.graphviz', 'IPython.sphinxext.ipython_console_highlighting', 'IPython.sphinxext.ipython_directive', - # 'sphinx.ext.autosectionlabel', ] ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"] @@ -53,10 +49,6 @@ # You can specify multiple suffix as a list of string: source_suffix = ['.rst', '.md', '.ipynb'] -# source_parsers = { -# '.md': CommonMarkParser, -# } - # The master toctree document. master_doc = 'index' @@ -134,7 +126,8 @@ # If given, this must be the name of an image file (path relative to the # configuration directory) that is the logo of the docs. It is placed at # the top of the sidebar; its width should therefore not exceed 200 pixels. -# html_logo = 'images/mlblocks-logo-small.png' +# html_logo = 'images/dai-logo-white-200.png' +# html_logo = 'images/mlblocks-logo-no-text-200.png' # -- Options for HTMLHelp output --------------------------------------- diff --git a/docs/getting_started/concepts.rst b/docs/getting_started/concepts.rst index ddeeab41..157ca45b 100644 --- a/docs/getting_started/concepts.rst +++ b/docs/getting_started/concepts.rst @@ -75,7 +75,7 @@ have these characteristics: In this case, no additional code is necessary to adapt them and those blocks can be brought into MLPrimitives using nothing else than a single JSON annotation file, which can be found in the -`mlprimitives/jsons folder`_. +`mlprimitives/primitives folder`_. Examples ******** @@ -108,7 +108,7 @@ Some examples of these primitives are the Keras models, which need to be built i and later on compiled before they can be used, or some image transformation primitives which need to be applied to the images one by one. These primitives consist of some Python code which can be found in the ``mlprimitives.adapters`` module, as well as JSON annotations that point at the -corresponding functions or classes, which can also be found in the `mlprimitives/jsons folder`_. +corresponding functions or classes, which can also be found in the `mlprimitives/primitives folder`_. 
Examples ******** @@ -126,7 +126,7 @@ primitives may be implemented from scratch or they may be using third party tool as to alter the third party tool’s native behavior to add new functionalities. This type of primitives consist of Python code that can be found inside the `mlprimitives/custom module`_, -as well as the corresponding JSON annotations, which can also be found in the `mlprimitives/jsons folder`_. +as well as the corresponding JSON annotations, which can also be found in the `mlprimitives/primitives folder`_. Examples ******** @@ -158,9 +158,9 @@ through a deeper code review in search of possible improvements in terms of perf functionality refinements -.. _mlprimitives/jsons folder: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/jsons +.. _mlprimitives/primitives folder: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/primitives .. _mlprimitives/custom module: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/custom .. _mlprimitives/candidates module: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/candidates -.. _numpy.argmax: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/jsons/numpy.argmax.json -.. _sklearn.preprocessing.StandardScaler: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/jsons/sklearn.preprocessing.StandardScaler.json -.. _xgboost.XGBClassifier: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/jsons/xgboost.XGBClassifier.json +.. _numpy.argmax: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/primitives/numpy.argmax.json +.. _sklearn.preprocessing.StandardScaler: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/primitives/sklearn.preprocessing.StandardScaler.json +.. _xgboost.XGBClassifier: https://github.com/HDI-Project/MLPrimitives/blob/master/mlprimitives/primitives/xgboost.XGBClassifier.json diff --git a/docs/getting_started/install.rst b/docs/getting_started/install.rst deleted file mode 100644 index bf1b5cb7..00000000 --- a/docs/getting_started/install.rst +++ /dev/null @@ -1,71 +0,0 @@ -.. highlight:: shell - -Installation -============ - -Stable release --------------- - -To install MLPrimitives, run this command in your terminal: - -.. code-block:: console - - $ pip install mlprimitives - -This is the preferred method to install MLPrimitives, as it will always install the most recent -stable release. - -If you don't have `pip`_ installed, this `Python installation guide`_ can guide -you through the process. - -.. _pip: https://pip.pypa.io -.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ - -From sources ------------- - -The sources for MLPrimitives can be downloaded from the `Github repo`_. - -You can either clone the ``stable`` branch form the public repository: - -.. code-block:: console - - $ git clone --branch stable git://github.com/HDI-Project/MLPrimitives - -Or download the `tarball`_: - -.. code-block:: console - - $ curl -OL https://github.com/HDI-Project/MLPrimitives/tarball/stable - -Once you have a copy of the source, you can install it with this command: - -.. code-block:: console - - $ make install - -.. _development: - -Development Setup ------------------ - -If you want to make changes in `MLPrimitives` and contribute them, you will need to prepare -your environment to do so. - -These are the required steps: - -1. Fork the MLPrimitives `Github repo`_. - -2. 
Clone your fork locally:: - - $ git clone git@github.com:your_name_here/MLPrimitives.git - -3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, - this is how you set up your fork for local development:: - - $ mkvirtualenv MLPrimitives - $ cd MLPrimitives/ - $ make install-develop - -.. _Github repo: https://github.com/HDI-Project/MLPrimitives -.. _tarball: https://github.com/HDI-Project/MLPrimitives/tarball/stable diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst deleted file mode 100644 index 01c3da40..00000000 --- a/docs/getting_started/quickstart.rst +++ /dev/null @@ -1,114 +0,0 @@ -Quickstart -========== - -Below is a short tutorial that will show you how to get started using MLPrimitives with `MLBlocks`_. - -In this tutorial we will learn how to: - -* Create a pipeline using multiple primitives -* Obtain the list of tunable hyperparameters from the pipeline -* Specify hyperparameters for each primitive in the pipeline -* Fit the pipeline using training data -* Use the pipeline to make predictions from new data - -Creating a pipeline -------------------- - -With MLBlocks, creating a pipeline is as simple as specifying a list of MLPrimitives and passing -them to the ``MLPipeline``: - -.. ipython:: python - - from mlblocks import MLPipeline - primitives = [ - 'mlprimitives.custom.feature_extraction.StringVectorizer', - 'sklearn.ensemble.RandomForestClassifier', - ] - pipeline = MLPipeline(primitives) - -Optionally, specific hyperparameters can be also set by specifying them in a dictionary: - -.. ipython:: python - - hyperparameters = { - 'sklearn.ensemble.RandomForestClassifier': { - 'n_estimators': 100 - } - } - pipeline = MLPipeline(primitives, hyperparameters) - -Once the pipeline has been instantiated, we can easily see what hyperparameters have been set -for each block, by calling the ``get_hyperparameters``. - -The output of this method is a dictionary which has the name of each block as keys and -a dictionary with the hyperparameters of the corresponding block as values. - -.. ipython:: python - - pipeline.get_hyperparameters() - -Tunable Hyperparameters ------------------------ - -One of the main features of MLPrimitives is the possibility to indicate the type and possible -values that each primitive hyperparameter accepts. - -The list of possible hyperparameters and their details can easily be obtained from the pipeline -instance by calling its ``get_tunable_hyperparameters``. - -The output of this method is a dictionary that contains the list of tunable hyperparameters -for each block in the pipeline, ready to be passed to any hyperparameter tuning library such -as `BTB`_. - -.. ipython:: python - - pipeline.get_tunable_hyperparameters() - -Setting Hyperparameters ------------------------ - -Modifying the hyperparameters of an already instantiated pipeline can be done using the -``set_hyperparameters method``, which expects a dictionary with the same format as the returned -by the ``get_hyperparameters method``. - -Note that if a subset of the hyperparameters is passed, only these will be modified, and the -other ones will remain unmodified. - -.. 
ipython:: python - - new_hyperparameters = { - 'sklearn.ensemble.RandomForestClassifier#1': { - 'max_depth': 15 - } - } - pipeline.set_hyperparameters(new_hyperparameters) - hyperparameters = pipeline.get_hyperparameters() - hyperparameters['sklearn.ensemble.RandomForestClassifier#1']['max_depth'] - -Making predictions ------------------- - -Once we have created the pipeline with the desired hyperparameters we can fit it -and then use it to make predictions on new data. - -To do this, we first call the ``fit`` method passing the training data and the corresponding -labels. - -.. ipython:: python - - from mlblocks.datasets import load_personae - dataset = load_personae() - X_train, X_test, y_train, y_test = dataset.get_splits(1) - pipeline.fit(X_train, y_train) - -Once we have fitted our model to our data, we can call the ``predict`` method passing new data -to obtain predictions from the pipeline. - -.. ipython:: python - - predictions = pipeline.predict(X_test) - predictions - dataset.score(y_test, predictions) - -.. _MLBlocks: https://github.com/HDI-Project/MLBlocks -.. _BTB: https://github.com/HDI-Project/BTB diff --git a/docs/images/dai-logo.png b/docs/images/dai-logo.png deleted file mode 100644 index 4abe1184..00000000 Binary files a/docs/images/dai-logo.png and /dev/null differ diff --git a/docs/index.rst b/docs/index.rst index 7cf5c745..db5588c2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,36 +1,10 @@ -Welcome to MLPrimitives! -======================== - -.. figure:: images/dai-logo.png - :width: 300 px - :alt: DAI-Lab Logo - - An open source project from Data to AI Lab at MIT. - -Overview --------- - -This repository contains primitive annotations to be used by the MLBlocks library, as well as -the necessary Python code to make some of them fully compatible with the MLBlocks API requirements. -There is also a collection of custom primitives contributed directly to this library, which either -combine third party tools or implement new functionalities from scratch. - -Why did we create this library? -------------------------------- - -* Too many libraries in a fast growing field -* Huge societal need to build machine learning apps -* Domain expertise resides at several places (knowledge of math) -* No documented information about hyperparameters, behavior... - +.. include:: readme.rst .. toctree:: :caption: Getting Started :maxdepth: 2 - Welcome - getting_started/install - getting_started/quickstart + Overview getting_started/concepts .. toctree:: diff --git a/docs/readme.rst b/docs/readme.rst new file mode 100644 index 00000000..97d49585 --- /dev/null +++ b/docs/readme.rst @@ -0,0 +1 @@ +.. 
mdinclude:: ../README.md
diff --git a/mlprimitives/__init__.py b/mlprimitives/__init__.py
index a576f56c..619458c0 100644
--- a/mlprimitives/__init__.py
+++ b/mlprimitives/__init__.py
@@ -4,8 +4,17 @@
 __author__ = 'MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.1.10-dev'
+__version__ = '0.2.1-dev'
 
 import os
 
-MLPRIMITIVES_JSONS_PATH = os.path.join(os.path.dirname(__file__), 'jsons')
+from mlblocks import MLBlock
+
+MLBLOCKS_PRIMITIVES = os.path.join(os.path.dirname(__file__), 'primitives')
+MLBLOCKS_PIPELINES = os.path.join(os.path.dirname(__file__), 'pipelines')
+
+
+def load_primitive(primitive, arguments=None):
+    arguments = arguments or dict()
+
+    return MLBlock(primitive, **arguments)
diff --git a/mlprimitives/adapters/pandas.py b/mlprimitives/adapters/pandas.py
index 3f68bb0c..423ca458 100644
--- a/mlprimitives/adapters/pandas.py
+++ b/mlprimitives/adapters/pandas.py
@@ -1,4 +1,20 @@
-def resample(df, rule, time_index, groupby=None, aggregation='mean'):
+import warnings
+
+from mlprimitives.utils import import_object
+
+_RESAMPLE_AGGS = [
+    'mean',
+    'median',
+    'prod',
+    'quantile',
+    'std',
+    'sum',
+    'var',
+]
+
+
+def resample(df, rule, on=None, groupby=(), aggregation='mean',
+             reset_index=True, time_index=None):
     """pd.DataFrame.resample adapter.
 
     Call the `df.resample` method on the given time_index
@@ -10,22 +26,57 @@
     If groupby option is used, the result is a multi-index datagrame.
 
     Args:
-        df (pandas.DataFrame): DataFrame to resample.
-        rule (str): The offset string or object representing target conversion.
-        groupby (list): Optional list of columns to group by.
-        time_index (str): Name of the column to use as the time index.
-        aggregation (str): Name of the aggregation function to use.
+        df (pandas.DataFrame):
+            DataFrame to resample.
+        rule (str or int):
+            The offset string or object representing target conversion or an
+            integer value that will be interpreted as the number of seconds.
+        on (str or None):
+            Name of the column to use as the time index. If ``None`` is given, the
+            DataFrame index is used.
+        groupby (list):
+            Optional list of columns to group by.
+        aggregation (callable or str):
+            Function or name of the function to use for the aggregation. If a name is given, it
+            can either be one of the standard pandas aggregation functions or the fully qualified
+            name of a python function that will be imported and used.
+        reset_index (bool):
+            Whether to reset the index after aggregating.
+        time_index (str or None):
+            Deprecated: This has been renamed to `on`.
+            Name of the column to use as the time index. If ``None`` is given, the
+            DataFrame index is used.
 
     Returns:
-        pandas.Dataframe: resampled dataframe
+        pandas.Dataframe:
+            resampled dataframe
     """
+    if on is None and time_index is not None:
+        message = (
+            'resample `time_index` argument deprecated and will be removed'
+            ' in future versions of MLPrimitives. Please use `on` instead.'
+        )
+        warnings.warn(message, DeprecationWarning, stacklevel=2)
+        on = time_index
+
     if groupby:
         df = df.groupby(groupby)
 
-    df = df.resample(rule, on=time_index)
-    df = getattr(df, aggregation)()
-    for column in groupby:
-        del df[column]
+    if isinstance(rule, int):
+        rule = '{}s'.format(rule)
+
+    dtir = df.resample(rule, on=on)
+
+    if not callable(aggregation) and aggregation not in _RESAMPLE_AGGS:
+        try:
+            aggregation = import_object(aggregation)
+        except (AttributeError, ImportError, ValueError):
+            pass
+
+    df = dtir.aggregate(aggregation)
+
+    if reset_index:
+        df.reset_index(inplace=True)
 
     return df
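As an illustration of the new signature (this example is not part of the patch; the column
names and data are made up), `resample` can now take the rule as an integer number of
seconds, together with an `on` column and a `groupby` list:

```python
import pandas as pd

from mlprimitives.adapters.pandas import resample

df = pd.DataFrame({
    'timestamp': pd.date_range('2019-01-01', periods=96, freq='30min'),
    'sensor': ['a', 'b'] * 48,
    'value': range(96),
})

# rule given as an integer is interpreted as seconds: 86400 -> '86400s' (daily)
daily = resample(df, rule=86400, on='timestamp', groupby=['sensor'], aggregation='mean')
```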
diff --git a/mlprimitives/candidates/timeseries.py b/mlprimitives/candidates/timeseries.py
deleted file mode 100644
index 68d1c6ef..00000000
--- a/mlprimitives/candidates/timeseries.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import numpy as np
-import pandas as pd
-
-
-def rolling_window_sequences(X, window_size, target_size, value_column, time_column):
-    """
-    Function that takes in a pandas.DataFrame and a window_size then creates
-    output arrays that correspond to a timeseries sequence with window_size overlap.
-    The output arrays can be fed into a timeseries forecasting model.
-    Assumes the input is timeseries sorted.
-    Args:
-        X (pandas.DataFrame): a pandas dataframe which has 'timestamp'
-            and 'value' columns, and is sorted based on timestamp.
-            The timestamp column is in UNIX format (in seconds).
-        window_size (int): number of values that overlap to create the sequence.
-        value_column (string): name of column that has the value field.
-        time_column (string): name of column that has the time field.
-    Returns:
-        (numpy.ndarray): contains the time series sequenced data with each
-            entry having window_size rows.
-        (numpy.ndarray): acts as the label for the forecasting problem with
-            each entry having window_size rows.
-        (numpy.ndarray): the corresponding timestamps series.
-    """
-    output_X = []
-    y = []
-    time = []
-    for start in range(len(X) - window_size - target_size):
-        end = start + window_size
-        output_X.append(X.iloc[start:end][value_column].values.reshape([-1, 1]))
-        y.append(X.iloc[end:end + target_size][value_column].values)
-        time.append(X.iloc[end + 1][time_column])
-
-    return np.asarray(output_X), np.asarray(y), np.asarray(time)
-
-
-def time_segments_average(X, interval, value_column, time_column):
-    """
-    function that aggregates data in a pandas dataframe by averaging over a given interval.
-    it starts averaging from the smallest timestamp in the dataframe and ends at the
-    largest timestamp. assumes the input is timeseries sorted.
-    args:
-        X (pandas.dataframe): a pandas dataframe which has 'timestamp'
-            and 'value' columns, and is sorted based on timestamp. the timestamp
-            column is in unix format (in seconds).
-        interval (int): an integer denoting the number of seconds
-            in the desired interval.
-        value_column (string): name of column that has the value field.
-        time_column (string): name of column that has the time field.
-    returns:
-        pandas.dataframe: a pandas dataframe with two colums
-            ('timestamp' and 'value'), where each `timestamp` is the starting time of
-            an interval and the `value` is the result of aggregation.
-    """
-    start_ts = X[time_column].iloc[0]  # min value
-    end_time = X[time_column].iloc[-1]  # max value in dataframe
-    accepted_points = []
-    while start_ts < end_time:
-        # average the values between start_ts, [start_ts + timedelta (e.g.
6hrs)] - upper_ts = start_ts + interval - mask = (X[time_column] > start_ts) & (X[time_column] <= upper_ts) - average_value = X.loc[mask][value_column].mean(skipna=True) - - accepted_points.append([start_ts, average_value]) - start_ts = upper_ts # update the timestamp - - return pd.DataFrame(accepted_points, columns=[time_column, value_column]) diff --git a/mlprimitives/candidates/timeseries_errors.py b/mlprimitives/candidates/timeseries_errors.py deleted file mode 100644 index e6465a6a..00000000 --- a/mlprimitives/candidates/timeseries_errors.py +++ /dev/null @@ -1,299 +0,0 @@ -import more_itertools as mit -import numpy as np - -# Methods to do dynamic error thresholding on timeseries data -# Implementation inspired by: https://arxiv.org/pdf/1802.04431.pdf - - -def get_forecast_errors(y_hat, - y_true, - window_size=5, - batch_size=30, - smoothing_percent=0.05, - smoothed=True): - """ - Calculates the forecasting error for two arrays of data. If smoothed errors desired, - runs EWMA. - Args: - y_hat (list): forecasted values. len(y_hat)==len(y_true). - y_true (list): true values. len(y_hat)==len(y_true). - window_size (int): - batch_size (int): - smoothing_percent (float): - smoothed (bool): whether the returned errors should be smoothed with EWMA. - Returns: - (list): error residuals. Smoothed if specified by user. - """ - errors = [abs(y_h - y_t) for y_h, y_t in zip(y_hat, y_true)] - - if not smoothed: - return errors - - historical_error_window = int(window_size * batch_size * smoothing_percent) - moving_avg = [] - for i in range(len(errors)): - left_window = i - historical_error_window - right_window = i + historical_error_window + 1 - if left_window < 0: - left_window = 0 - - if right_window > len(errors): - right_window = len(errors) - - moving_avg.append(np.mean(errors[left_window:right_window])) - - return moving_avg - - -def extract_anomalies(y_true, smoothed_errors, window_size, batch_size, error_buffer): - """ - Extracts anomalies from the errors. - Args: - y_true (): - smoothed_errors (): - window_size (int): - batch_size (int): - error_buffer (int): - Returns: - """ - if len(y_true) <= batch_size * window_size: - raise ValueError("Window size (%s) larger than y_true (len=%s)." 
- % (batch_size, len(y_true))) - - num_windows = int((len(y_true) - (batch_size * window_size)) / batch_size) - - anomalies_indices = [] - - for i in range(num_windows + 1): - prev_index = i * batch_size - curr_index = (window_size * batch_size) + (i * batch_size) - - if i == num_windows + 1: - curr_index = len(y_true) - - window_smoothed_errors = smoothed_errors[prev_index:curr_index] - window_y_true = y_true[prev_index:curr_index] - - epsilon, sd_threshold = compute_threshold(window_smoothed_errors, error_buffer) - - window_anom_indices = get_anomalies( - window_smoothed_errors, - window_y_true, - sd_threshold, - i, - anomalies_indices, - error_buffer - ) - - # get anomalies from inverse of smoothed errors - # This was done in the implementation of NASA paper but - # wasn't referenced in the paper - - # we get the inverse by flipping around the mean - mu = np.mean(window_smoothed_errors) - smoothed_errors_inv = [mu + (mu - e) for e in window_smoothed_errors] - epsilon_inv, sd_inv = compute_threshold(smoothed_errors_inv, error_buffer) - inv_anom_indices = get_anomalies( - smoothed_errors_inv, - window_y_true, - sd_inv, - i, - anomalies_indices, - len(y_true) - ) - - anomalies_indices = list(set(anomalies_indices + inv_anom_indices)) - - anomalies_indices.extend([i_a + i * batch_size for i_a in window_anom_indices]) - - # group anomalies - anomalies_indices = sorted(list(set(anomalies_indices))) - anomalies_groups = [list(group) for group in mit.consecutive_groups(anomalies_indices)] - anomaly_sequences = [(g[0], g[-1]) for g in anomalies_groups if not g[0] == g[-1]] - - # generate "scores" for anomalies based on the max distance from epsilon for each sequence - anomalies_scores = [] - for e_seq in anomaly_sequences: - denominator = np.mean(smoothed_errors) + np.std(smoothed_errors) - score = max([ - abs(smoothed_errors[x] - epsilon) / denominator - for x in range(e_seq[0], e_seq[1]) - ]) - - anomalies_scores.append(score) - - return anomaly_sequences, anomalies_scores - - -def compute_threshold(smoothed_errors, error_buffer, sd_limit=12.0): - """Helper method for `extract_anomalies` method. - Calculates the epsilon (threshold) for anomalies. - """ - mu = np.mean(smoothed_errors) - sigma = np.std(smoothed_errors) - - max_epsilon = 0 - sd_threshold = sd_limit - - # The treshold is determined dynamically by testing multiple Zs. 
- # z is drawn from an ordered set of positive values representing the - # number of standard deviations above mean(smoothed_errors) - - # here we iterate in increments of 0.5 on the range that the NASA paper found to be good - for z in np.arange(2.5, sd_limit, 0.5): - epsilon = mu + (sigma * z) - below_epsilon, below_indices, above_epsilon = [], [], [] - - for i in range(len(smoothed_errors)): - e = smoothed_errors[i] - if e < epsilon: - # save to compute delta mean and delta std - # these are important for epsilon calculation - below_epsilon.append(e) - below_indices.append(i) - - if e > epsilon: - # above_epsilon values are anomalies - for j in range(0, error_buffer): - if (i + j) not in above_epsilon and (i + j) < len(smoothed_errors): - above_epsilon.append(i + j) - - if (i - j) not in above_epsilon and (i - j) >= 0: - above_epsilon.append(i - j) - - if len(above_epsilon) == 0: - continue - - # generate sequences - above_epsilon = sorted(list(set(above_epsilon))) - groups = [list(group) for group in mit.consecutive_groups(above_epsilon)] - above_sequences = [(g[0], g[-1]) for g in groups if not g[0] == g[-1]] - - mean_perc_decrease = (mu - np.mean(below_epsilon)) / mu - sd_perc_decrease = (sigma - np.std(below_epsilon)) / sigma - epsilon = (mean_perc_decrease + sd_perc_decrease) /\ - (len(above_sequences)**2 + len(above_epsilon)) - - # update the largest epsilon we've seen so far - if epsilon > max_epsilon: - sd_threshold = z - max_epsilon = epsilon - - # sd_threshold can be multiplied by sigma to get epsilon - return max_epsilon, sd_threshold - - -def get_anomalies(smoothed_errors, y_true, z, window, all_anomalies, error_buffer): - """ - Helper method to get anomalies. - """ - - mu = np.mean(smoothed_errors) - sigma = np.std(smoothed_errors) - - epsilon = mu + (z * sigma) - - # compare to epsilon - errors_seq, anomaly_indices, max_error_below_e = group_consecutive_anomalies( - smoothed_errors, - epsilon, - y_true, - error_buffer, - window, - all_anomalies - ) - - if len(errors_seq) > 0: - anomaly_indices = prune_anomalies( - errors_seq, - smoothed_errors, - max_error_below_e, - anomaly_indices - ) - - return anomaly_indices - - -def group_consecutive_anomalies(smoothed_errors, - epsilon, - y_true, - error_buffer, - window, - all_anomalies, - batch_size=30): - upper_percentile, lower_percentile = np.percentile(y_true, [95, 5]) - accepted_range = upper_percentile - lower_percentile - - minimum_index = 100 # have a cutoff value for anomalies until model is trained enough - - anomaly_indices = [] - max_error_below_e = 0 - - for i in range(len(smoothed_errors)): - if smoothed_errors[i] <= epsilon or smoothed_errors[i] <= 0.05 * accepted_range: - # not an anomaly - continue - - for j in range(error_buffer): - if (i + j) < len(smoothed_errors) and (i + j) not in anomaly_indices: - if (i + j) > minimum_index: - anomaly_indices.append(i + j) - - if (i - j) < len(smoothed_errors) and (i - j) not in anomaly_indices: - if (i - j) > minimum_index: - anomaly_indices.append(i - j) - - # get all the errors that are below epsilon and which - # weren't identified as anomalies to process them - for i in range(len(smoothed_errors)): - adjusted_index = i + (window - 1) * batch_size - if smoothed_errors[i] > max_error_below_e and adjusted_index not in all_anomalies: - if i not in anomaly_indices: - max_error_below_e = smoothed_errors[i] - - # group anomalies into continuous sequences - anomaly_indices = sorted(list(set(anomaly_indices))) - groups = [list(group) for group in 
mit.consecutive_groups(anomaly_indices)]
-    e_seq = [(g[0], g[-1]) for g in groups if g[0] != g[-1]]
-
-    return e_seq, anomaly_indices, max_error_below_e
-
-
-def prune_anomalies(e_seq, smoothed_errors, max_error_below_e, anomaly_indices):
-    """ Helper method that removes anomalies which don't meet
-    a minimum separation from next anomaly.
-    """
-    # min accepted perc decrease btwn max errors in anomalous sequences
-    MIN_PERCENT_DECREASE = 0.05
-    e_seq_max, smoothed_errors_max = [], []
-
-    for error_seq in e_seq:
-        if len(smoothed_errors[error_seq[0]:error_seq[1]]) > 0:
-            sliced_errors = smoothed_errors[error_seq[0]:error_seq[1]]
-            e_seq_max.append(max(sliced_errors))
-            smoothed_errors_max.append(max(sliced_errors))
-
-    smoothed_errors_max.sort(reverse=True)
-
-    if max_error_below_e > 0:
-        smoothed_errors_max.append(max_error_below_e)
-    indices_remove = []
-
-    for i in range(len(smoothed_errors_max)):
-        if i < len(smoothed_errors_max) - 1:
-            delta = smoothed_errors_max[i] - smoothed_errors_max[i + 1]
-            perc_change = delta / smoothed_errors_max[i]
-            if perc_change < MIN_PERCENT_DECREASE:
-                indices_remove.append(e_seq_max.index(smoothed_errors_max[i]))
-
-    for index in sorted(indices_remove, reverse=True):
-        del e_seq[index]
-
-    pruned_indices = []
-
-    for i in anomaly_indices:
-        for error_seq in e_seq:
-            if i >= error_seq[0] and i <= error_seq[1]:
-                pruned_indices.append(i)
-
-    return pruned_indices
diff --git a/mlprimitives/cli.py b/mlprimitives/cli.py
index e85a884e..73ee3b5a 100644
--- a/mlprimitives/cli.py
+++ b/mlprimitives/cli.py
@@ -32,7 +32,7 @@ def _logging_setup(verbosity=1):
 def _test(args):
     for pipeline in args.pipeline:
         print('Scoring pipeline: {}'.format(pipeline))
-        score, stdev = score_pipeline(pipeline, args.splits)
+        score, stdev = score_pipeline(pipeline, args.splits, args.random_state, args.dataset)
         print('Obtained Score: {:.4f} +/- {:.4f}'.format(score, stdev))
@@ -78,6 +78,10 @@ def _get_parser():
     subparser.set_defaults(action=_test)
     subparser.add_argument('-s', '--splits', default=1, type=int,
                            help='Number of splits to use for Cross Validation')
+    subparser.add_argument('-r', '--random-state', default=0, type=int,
+                           help='Random State to use for Cross Validation')
+    subparser.add_argument('-d', '--dataset',
+                           help='Dataset to validate with.')
     subparser.add_argument('pipeline', nargs='+')
 
     subparser = subparsers.add_parser('list', help='List available primitives')
diff --git a/mlprimitives/custom/feature_extraction.py b/mlprimitives/custom/feature_extraction.py
index 36cd4e9b..6893121e 100644
--- a/mlprimitives/custom/feature_extraction.py
+++ b/mlprimitives/custom/feature_extraction.py
@@ -2,7 +2,6 @@
 
 import logging
 
-import numpy as np
 import pandas as pd
 from sklearn.feature_extraction.text import CountVectorizer
 
@@ -58,30 +57,24 @@ def fit_transform(self, feature):
 
 class FeatureExtractor(object):
     """Single FeatureExtractor applied to multiple features."""
 
-    def __init__(self, copy=True, features=None):
+    def __init__(self, copy=True, features=None, keep=False):
         self.copy = copy
         self.features = features or []
+        self.keep = keep
         self._features = []
 
-    @staticmethod
-    def detect_features(X):
-        features = []
-
-        for column in X.columns:
-            if not np.issubdtype(X[column].dtype, np.number):
-                features.append(column)
-
-        return features
-
     def _fit(self, x):
         pass
 
+    def _detect_features(self, X):
+        pass
+
     def fit(self, X, y=None):
         if not isinstance(X, pd.DataFrame):
             X = pd.DataFrame(X)
 
         if self.features == 'auto':
-            self._features = self.detect_features(X)
+            self._features =
self._detect_features(X) else: self._features = self.features @@ -100,7 +93,11 @@ def transform(self, X): for feature in self._features: LOGGER.debug("Extracting feature %s", feature) - x = X.pop(feature) + if self.keep: + x = X[feature] + else: + x = X.pop(feature) + extracted = self._transform(x) X = pd.concat([X, extracted], axis=1) @@ -111,12 +108,13 @@ def fit_transform(self, X, y=None): return self.transform(X) +def _is_str(x): + return isinstance(x, str) + + class CategoricalEncoder(FeatureExtractor): """Use the OneHotLabelEncoder only on categorical features. - NOTE: At the moment of this release, sklearn.preprocessing.data.CategoricalEncoder - has not been released yet, this is why we write our own version of it. - >>> df = pd.DataFrame([ ... {'a': 'a', 'b': 1, 'c': 1}, ... {'a': 'a', 'b': 2, 'c': 2}, @@ -130,10 +128,23 @@ class CategoricalEncoder(FeatureExtractor): 2 2 0 1 1 0 """ - def __init__(self, max_labels=None, **kwargs): + def __init__(self, max_labels=None, max_unique_ratio=1, **kwargs): self.max_labels = max_labels + self.max_unique_ratio = max_unique_ratio super(CategoricalEncoder, self).__init__(**kwargs) + def _detect_features(self, X): + features = list() + + for column in X.select_dtypes('object'): + x = X[column] + unique_ratio = len(x.unique()) / len(x) + if unique_ratio < self.max_unique_ratio: + if x.apply(_is_str).all(): + features.append(column) + + return features + def fit(self, X, y=None): self.encoders = dict() super(CategoricalEncoder, self).fit(X) @@ -151,9 +162,25 @@ def _transform(self, x): class StringVectorizer(FeatureExtractor): """Use the sklearn CountVectorizer only on string features.""" - def __init__(self, copy=True, features=None, **kwargs): + DTYPE = 'object' + + def __init__(self, copy=True, features=None, keep=False, min_words=3, **kwargs): self.kwargs = kwargs - super(StringVectorizer, self).__init__(copy, features) + self.min_words = min_words + super(StringVectorizer, self).__init__(copy, features, keep) + + def _detect_features(self, X): + features = [] + + analyzer = CountVectorizer(**self.kwargs).build_analyzer() + for column in X.select_dtypes('object'): + try: + if (X[column].apply(analyzer).str.len() >= self.min_words).any(): + features.append(column) + except (ValueError, AttributeError): + pass + + return features def fit(self, X, y=None): self.vectorizers = dict() @@ -174,19 +201,10 @@ def _transform(self, x): class DatetimeFeaturizer(FeatureExtractor): """Extract features from a datetime.""" - @staticmethod - def detect_features(X): - features = [] - for column in X.columns: - if np.issubdtype(X[column].dtype, np.datetime64): - features.append(column) - - return features + def _detect_features(self, X): + return list(X.select_dtypes('datetime').columns) def _transform(self, x): - if not np.issubdtype(x.dtype, np.datetime64): - x = pd.to_datetime(x) - prefix = x.name + '_' features = { prefix + 'year': x.dt.year, diff --git a/mlprimitives/custom/timeseries_anomalies.py b/mlprimitives/custom/timeseries_anomalies.py index 6d9af9e4..977b24b2 100644 --- a/mlprimitives/custom/timeseries_anomalies.py +++ b/mlprimitives/custom/timeseries_anomalies.py @@ -14,14 +14,20 @@ def regression_errors(y, y_hat, smoothing_window=0.01, smooth=True): If smooth is True, apply EWMA to the resulting array of errors. Args: - y (array): Ground truth. - y_hat (array): Predictions array. - smoothing_window (float): Size of the smoothing window, expressed as a proportion - of the total length of y. 
- smooth (bool): whether the returned errors should be smoothed with EWMA. + y (ndarray): + Ground truth. + y_hat (ndarray): + Predicted values. + smoothing_window (float): + Optional. Size of the smoothing window, expressed as a proportion of the total + length of y. If not given, 0.01 is used. + smooth (bool): + Optional. Indicates whether the returned errors should be smoothed with EWMA. + If not given, `True` is used. Returns: - (array): errors + ndarray: + Array of errors. """ errors = np.abs(y - y_hat)[:, 0] @@ -38,6 +44,22 @@ def deltas(errors, epsilon, mean, std): delta_mean = mean(errors) - mean(all errors below epsilon) delta_std = std(errors) - std(all errors below epsilon) + + Args: + errors (ndarray): + Array of errors. + epsilon (float): + Threshold value. + mean (float): + Mean of errors. + std (float): + Standard deviation of errors. + + Returns: + float: + delta_mean. + float: + delta_std. """ below = errors[errors <= epsilon] if not len(below): @@ -52,6 +74,18 @@ def count_above(errors, epsilon): Continuous sequences are counted by shifting and counting the number of positions where there was a change and the original value was true, which means that a sequence started at that position. + + Args: + errors (ndarray): + Array of errors. + epsilon (float): + Threshold value. + + Returns: + int: + Number of errors above epsilon. + int: + Number of continuous sequences above epsilon. """ above = errors > epsilon total_above = len(errors[above]) @@ -78,9 +112,22 @@ def z_cost(z, errors, mean, std): the better the `z`. In this case, we return this value inverted (we make it negative), to convert - it into a cost function, as later on we will use scipy to minimize it. - """ + it into a cost function, as later on we will use scipy.optimize.fmin to minimize it. + Args: + z (ndarray): + Value for which a cost score is calculated. + errors (ndarray): + Array of errors. + mean (float): + Mean of errors. + std (float): + Standard deviation of errors. + + Returns: + float: + Cost of z. + """ epsilon = mean + z * std delta_mean, delta_std = deltas(errors, epsilon, mean, std) @@ -95,12 +142,23 @@ def z_cost(z, errors, mean, std): return numerator / denominator -def _find_threshold(errors, z_range=(0, 10)): +def _find_threshold(errors, z_range): """Find the ideal threshold. - The ideal threshold is the one that minimizes the z_cost function. - """ + The ideal threshold is the one that minimizes the z_cost function. scipy.optimize.fmin is used + to find the minimum, using the values from z_range as starting points. + Args: + errors (ndarray): + Array of errors. + z_range (list): + List of two values denoting the range out of which the start points for the + scipy.optimize.fmin function are chosen. + + Returns: + float: + Calculated threshold value. + """ mean = errors.mean() std = errors.std() @@ -116,18 +174,39 @@ def _find_threshold(errors, z_range=(0, 10)): return mean + best_z * std -def _find_sequences(errors, epsilon): +def _find_sequences(errors, epsilon, anomaly_padding): """Find sequences of values that are above epsilon. This is done following these steps: * create a boolean mask that indicates which values are above epsilon. - * shift this mask by one place, filing the empty gap with a False + * mark a certain range of errors around each True value as True as well. + * shift this mask by one place, filling the empty gap with a False. * compare the shifted mask with the original one to see if there are changes.
- * Consider a sequence start any point which was true and has changed - * Consider a sequence end any point which was false and has changed + * Consider as a sequence start any point which was true and has changed. + * Consider as a sequence end any point which was false and has changed. + + Args: + errors (ndarray): + Array of errors. + epsilon (float): + Threshold value. All errors above epsilon are considered an anomaly. + anomaly_padding (int): + Number of errors before and after a found anomaly that are added to the + anomalous sequence. + + Returns: + ndarray: + Array containing start, end of each found anomalous sequence. + float: + Maximum error value that was not considered an anomaly. """ above = pd.Series(errors > epsilon) + index_above = np.argwhere(above) + + for idx in index_above.flatten(): + above[max(0, idx - anomaly_padding):min(idx + anomaly_padding + 1, len(above))] = True + shift = above.shift(1).fillna(False) change = above != shift @@ -139,6 +218,7 @@ def _find_sequences(errors, epsilon): index = above.index starts = index[above & change].tolist() ends = (index[~above & change] - 1).tolist() + if len(ends) == len(starts) - 1: ends.append(len(above) - 1) @@ -151,8 +231,20 @@ def _get_max_errors(errors, sequences, max_below): Also add a row with the max error which was not considered anomalous. Table containing a ``max_error`` column with the maximum error of each - sequence and a column ``sequence`` with the corresponding start and stop - indexes, sorted descendingly by errors. + sequence and the columns ``start`` and ``stop`` with the corresponding start and stop + indexes, sorted in descending order by the maximum error. + + Args: + errors (ndarray): + Array of errors. + sequences (ndarray): + Array containing start, end of anomalous sequences. + max_below (float): + Maximum error value that was not considered an anomaly. + + Returns: + pandas.DataFrame: + DataFrame object containing columns ``start``, ``stop`` and ``max_error``. """ max_errors = [{ 'max_error': max_below, @@ -170,7 +262,6 @@ def _get_max_errors(errors, sequences, max_below): }) max_errors = pd.DataFrame(max_errors).sort_values('max_error', ascending=False) - return max_errors.reset_index(drop=True) @@ -185,8 +276,18 @@ def _prune_anomalies(max_errors, min_percent): * Find rows which are below ``min_percent``. * Find the index of the latest of such rows. * Get the values of all the sequences above that index. - """ + Args: + max_errors (pandas.DataFrame): + DataFrame object containing columns ``start``, ``stop`` and ``max_error``. + min_percent (float): + Percentage of separation the anomalies need to meet between themselves and the + highest non-anomalous error in the window sequence. + + Returns: + ndarray: + Array containing start, end, max_error of the pruned anomalies. + """ next_error = max_errors['max_error'].shift(-1).iloc[:-1] max_error = max_errors['max_error'].iloc[:-1] @@ -198,68 +299,189 @@ def _prune_anomalies(max_errors, min_percent): else: last_index = max_error[~too_small].index[-1] - return max_errors[['start', 'stop']].iloc[0: last_index + 1].values + return max_errors[['start', 'stop', 'max_error']].iloc[0: last_index + 1].values + +def _compute_scores(pruned_anomalies, errors, threshold, window_start): + """Compute the score of the anomalies. -def _merge_consecutive(sequences): - """Merge consecutive sequences. + Calculate the score of the anomalies proportional to the maximum error in the sequence + and add window_start timestamp to make the index absolute.
+ + Args: + pruned_anomalies (ndarray): + Array of anomalies containing the start, end and max_error for all anomalies in + the window. + errors (ndarray): + Array of errors. + threshold (float): + Threshold value. + window_start (int): + Index of the first error value in the window. - We iterate over a list of start, end pairs and merge together - the cases where the start of a sequence is exactly the end - of the previous sequence + 1. + Returns: + list: + List of anomalies containing start-index, end-index, score for each anomaly. """ - previous = -2 - new_sequences = list() - for start, end in sequences: - if previous + 1 == start: - new_sequences[-1][1] = end - else: - new_sequences.append([start, end]) + anomalies = list() + denominator = errors.mean() + errors.std() + + for row in pruned_anomalies: + max_error = row[2] + score = (max_error - threshold) / denominator + anomalies.append([row[0] + window_start, row[1] + window_start, score]) + + return anomalies + - previous = end +def _merge_sequences(sequences): + """Merge consecutive and overlapping sequences. + + We iterate over a list of start, end, score triples and merge together + overlapping or consecutive sequences. + The score of a merged sequence is the average of the single scores, + weighted by the length of the corresponding sequences. + + Args: + sequences (list): + List of anomalies, containing start-index, end-index, score for each anomaly. + + Returns: + ndarray: + Array containing start-index, end-index, score for each anomaly after merging. + """ + if len(sequences) == 0: + return np.array([]) + + sorted_sequences = sorted(sequences, key=lambda entry: entry[0]) + new_sequences = [sorted_sequences[0]] + score = [sorted_sequences[0][2]] + weights = [sorted_sequences[0][1] - sorted_sequences[0][0]] + + for sequence in sorted_sequences[1:]: + prev_sequence = new_sequences[-1] + + if sequence[0] <= prev_sequence[1] + 1: + score.append(sequence[2]) + weights.append(sequence[1] - sequence[0]) + weighted_average = np.average(score, weights=weights) + new_sequences[-1] = (prev_sequence[0], max(prev_sequence[1], sequence[1]), + weighted_average) + else: + score = [sequence[2]] + weights = [sequence[1] - sequence[0]] + new_sequences.append(sequence) return np.array(new_sequences) -def find_anomalies(errors, index, z_range=(0, 10), window_size=None, min_percent=0.1): +def _find_window_sequences(window, z_range, anomaly_padding, min_percent, window_start): """Find sequences of values that are anomalous. - We first find the ideal threshold for the set of errors that we have, - and then find the sequences of values that are above this threshold. + We first find the threshold for the window, then find all sequences above that threshold. + After that, we get the max errors of the sequences and prune the anomalies. Lastly, the + score of the anomalies is computed. - Lastly, we compute a score proportional to the maximum error in the - sequence, and finally return the index pairs that correspond to - each sequence, along with its score. + Args: + window (ndarray): + Array of errors in the window that is analyzed. + z_range (list): + List of two values denoting the range out of which the start points for the + _find_threshold function are chosen. + anomaly_padding (int): + Number of errors before and after a found anomaly that are added to the anomalous + sequence. + min_percent (float): + Percentage of separation the anomalies need to meet between themselves and the + highest non-anomalous error in the window sequence.
+ window_start (int): + Index of the first error value in the window. + + Returns: + ndarray: + Array containing the start-index, end-index, score for each anomalous sequence + that was found in the window. + """ + + threshold = _find_threshold(window, z_range) + window_sequences, max_below = _find_sequences(window, threshold, anomaly_padding) + max_errors = _get_max_errors(window, window_sequences, max_below) + pruned_anomalies = _prune_anomalies(max_errors, min_percent) + window_sequences = _compute_scores(pruned_anomalies, window, threshold, window_start) + + return window_sequences + + +def find_anomalies(errors, index, z_range=(0, 10), window_size=None, window_step_size=None, + min_percent=0.1, anomaly_padding=50, lower_threshold=False): + """Find sequences of error values that are anomalous. + + We first define the window of errors that we want to analyze. We then find the anomalous + sequences in that window and store the start/stop index pairs that correspond to each + sequence, along with its score. Optionally, we can flip the error sequence around the mean + and apply the same procedure, allowing us to find unusually low error sequences. + We then move the window and repeat the procedure. + Lastly, we combine overlapping or consecutive sequences. + + Args: + errors (ndarray): + Array of errors. + index (ndarray): + Array of indices of the errors. + z_range (list): + Optional. List of two values denoting the range out of which the start points for + the scipy.optimize.fmin function are chosen. If not given, (0, 10) is used. + window_size (int): + Optional. Size of the window for which a threshold is calculated. If not given, + `None` is used, which finds one threshold for the entire sequence of errors. + window_step_size (int): + Optional. Number of steps the window is moved before another threshold is + calculated for the new window. + min_percent (float): + Optional. Percentage of separation the anomalies need to meet between themselves and + the highest non-anomalous error in the window sequence. If not given, 0.1 is used. + anomaly_padding (int): + Optional. Number of errors before and after a found anomaly that are added to the + anomalous sequence. If not given, 50 is used. + lower_threshold (bool): + Optional. Indicates whether to apply a lower threshold to find unusually low errors. + If not given, `False` is used. + + Returns: + ndarray: + Array containing start-index, end-index, score for each anomalous sequence that + was found.
""" window_size = window_size or len(errors) + window_step_size = window_step_size or window_size window_start = 0 + window_end = 0 sequences = list() - while window_start < len(errors): + + while window_end < len(errors): window_end = window_start + window_size window = errors[window_start:window_end] - - threshold = _find_threshold(window, z_range) - window_sequences, max_below = _find_sequences(window, threshold) - - max_errors = _get_max_errors(window, window_sequences, max_below) - window_sequences = _prune_anomalies(max_errors, min_percent) - - # indexes are relative to each window, so we need to add - # the window_start to all of them to make them absolute - window_sequences += window_start - + window_sequences = _find_window_sequences(window, z_range, anomaly_padding, min_percent, + window_start) sequences.extend(window_sequences) - window_start = window_end + if lower_threshold: + # Flip errors sequence around mean + mean = window.mean() + inverted_window = mean - (window - mean) + inverted_window_sequences = _find_window_sequences(inverted_window, z_range, + anomaly_padding, min_percent, + window_start) + sequences.extend(inverted_window_sequences) - sequences = _merge_consecutive(sequences) + window_start = window_start + window_step_size + + sequences = _merge_sequences(sequences) anomalies = list() - denominator = errors.mean() + errors.std() - for start, stop in sequences: - max_error = errors[start:stop + 1].max() - score = (max_error - threshold) / denominator - anomalies.append([index[start], index[stop], score]) + + for start, stop, score in sequences: + anomalies.append([index[int(start)], index[int(stop)], score]) return np.asarray(anomalies) diff --git a/mlprimitives/datasets.py b/mlprimitives/datasets.py index f582b27e..aa7d4bd6 100644 --- a/mlprimitives/datasets.py +++ b/mlprimitives/datasets.py @@ -146,7 +146,7 @@ def _get_split(data, index): else: return data[index] - def get_splits(self, n_splits=1): + def get_splits(self, n_splits=1, random_state=0): """Return splits of this dataset ready for Cross Validation. 
If n_splits is 1, a tuple containing the X for train and test @@ -171,12 +171,13 @@ def get_splits(self, n_splits=1): self.data, self.target, shuffle=self._shuffle, - stratify=stratify + stratify=stratify, + random_state=random_state ) else: cv_class = StratifiedKFold if self._stratify else KFold - cv = cv_class(n_splits=n_splits, shuffle=self._shuffle) + cv = cv_class(n_splits=n_splits, shuffle=self._shuffle, random_state=random_state) splits = list() for train, test in cv.split(self.data, self.target): diff --git a/mlprimitives/evaluation.py b/mlprimitives/evaluation.py index f13e998c..d50e581f 100644 --- a/mlprimitives/evaluation.py +++ b/mlprimitives/evaluation.py @@ -20,21 +20,6 @@ LOGGER = logging.getLogger(__name__) -def build_pipeline(pipeline_spec): - pipeline = MLPipeline( - pipeline_spec['primitives'], - pipeline_spec.get('init_params', dict()), - pipeline_spec.get('input_names', dict()), - pipeline_spec.get('output_names', dict()), - ) - - hyperparameters = pipeline_spec.get('hyperparameters') - if hyperparameters: - pipeline.set_hyperparameters(hyperparameters) - - return pipeline - - def get_value(dataset, value): if isinstance(value, str) and value.startswith('$'): value = getattr(dataset, value[1:]) @@ -65,14 +50,16 @@ def scorer(obs, exp): return scorer -def score_pipeline(pipeline_metadata, n_splits=5): +def score_pipeline(pipeline_metadata, n_splits=5, random_state=0, dataset=None): if isinstance(pipeline_metadata, str): LOGGER.info('Loading pipeline %s', pipeline_metadata) with open(pipeline_metadata, 'r') as pipeline_file: pipeline_metadata = json.load(pipeline_file) validation = pipeline_metadata['validation'] - dataset = validation['dataset'] + if dataset is None: + dataset = validation['dataset'] + LOGGER.info('Loading dataset %s', dataset) dataset = load_dataset(dataset) metric = validation.get('metric') @@ -84,14 +71,14 @@ def score_pipeline(pipeline_metadata, n_splits=5): metric = dataset.metric scores = list() - splits = dataset.get_splits(n_splits) + splits = dataset.get_splits(n_splits, random_state) if n_splits == 1: splits = [splits] for split, (X_train, X_test, y_train, y_test) in enumerate(splits): LOGGER.info('Scoring split %s', split + 1) context = get_context(dataset, validation.get('context', dict())) - pipeline = build_pipeline(pipeline_metadata) + pipeline = MLPipeline.from_dict(pipeline_metadata) pipeline.fit(X_train, y_train, **context) predictions = pipeline.predict(X_test, **context) diff --git a/mlprimitives/jsons/mlprimitives.candidates.timeseries.rolling_window_sequences.json b/mlprimitives/jsons/mlprimitives.candidates.timeseries.rolling_window_sequences.json deleted file mode 100644 index 7d4813e1..00000000 --- a/mlprimitives/jsons/mlprimitives.candidates.timeseries.rolling_window_sequences.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "name": "mlprimitives.candidates.timeseries.rolling_window_sequences", - "contributors": [ - "Ihssan Tinawi " - ], - "description": "mlprimitives.candidates.timeseries.rolling_window_sequences", - "classifiers": { - "type": "preprocessor", - "subtype": "feature_extractor" - }, - "modalities": [ - "timeseries" - ], - "primitive": "mlprimitives.candidates.timeseries.rolling_window_sequences", - "produce": { - "args": [ - { - "name": "X", - "type": "pandas.DataFrame" - } - ], - "output": [ - { - "name": "X", - "type": "ndarray" - }, - { - "name": "y", - "type": "ndarray" - }, - { - "name": "time", - "type": "ndarray" - } - ] - }, - "hyperparameters": { - "fixed": { - "window_size": { - "type": "int", - "default": 50 - 
}, - "target_size": { - "type": "int", - "default": 1 - }, - "time_column": { - "type": "str" - }, - "value_column": { - "type": "str" - } - } - } -} diff --git a/mlprimitives/jsons/mlprimitives.candidates.timeseries.time_segments_average.json b/mlprimitives/jsons/mlprimitives.candidates.timeseries.time_segments_average.json deleted file mode 100644 index 4f0341d7..00000000 --- a/mlprimitives/jsons/mlprimitives.candidates.timeseries.time_segments_average.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "name": "mlprimitives.candidates.timeseries.time_segments_average", - "contributors": [ - "Ihssan Tinawi " - ], - "description": "mlprimitives.candidates.timeseries.time_segments_average", - "classifiers": { - "type": "preprocessor", - "subtype": "feature_extractor" - }, - "modalities": [ - "timeseries" - ], - "primitive": "mlprimitives.candidates.timeseries.time_segments_average", - "produce": { - "args": [ - { - "name": "X", - "type": "pandas.DataFrame" - } - ], - "output": [ - { - "name": "X", - "type": "pandas.DataFrame" - } - ] - }, - "hyperparameters": { - "fixed": { - "interval": { - "type": "int", - "default": 3600 - }, - "value_column": { - "type": "str" - }, - "time_column": { - "type": "str" - } - }, - "tunable": {} - } -} diff --git a/mlprimitives/jsons/mlprimitives.candidates.timeseries_errors.extract_anomalies.json b/mlprimitives/jsons/mlprimitives.candidates.timeseries_errors.extract_anomalies.json deleted file mode 100644 index d95e7265..00000000 --- a/mlprimitives/jsons/mlprimitives.candidates.timeseries_errors.extract_anomalies.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "name": "mlprimitives.candidates.timeseries_errors.extract_anomalies", - "contributors": [ - "Ihssan Tinawi " - ], - "description": "mlprimitives.candidates.timeseries_errors.extract_anomalies", - "classifiers": { - "type": "preprocessor", - "subtype": "feature_extractor" - }, - "modalities": [ - "timeseries" - ], - "primitive": "mlprimitives.candidates.timeseries_errors.extract_anomalies", - "produce": { - "args": [ - { - "name": "y_true", - "type": "np.array" - }, - { - "name": "smoothed_errors", - "type": "list" - } - ], - "output": [ - { - "name": "anomaly_sequences", - "type": "list" - }, - { - "name": "anomalies_scores", - "type": "list" - } - ] - }, - "hyperparameters": { - "tunable": { - "window_size": { - "type": "int", - "default": 50 - }, - "batch_size": { - "type": "int", - "default": 200 - }, - "error_buffer": { - "type": "int", - "default": 10 - } - }, - "fixed": {} - } -} diff --git a/mlprimitives/jsons/mlprimitives.candidates.timeseries_errors.get_forecast_errors.json b/mlprimitives/jsons/mlprimitives.candidates.timeseries_errors.get_forecast_errors.json deleted file mode 100644 index 36325b25..00000000 --- a/mlprimitives/jsons/mlprimitives.candidates.timeseries_errors.get_forecast_errors.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "name": "mlprimitives.candidates.timeseries_errors.get_forecast_errors", - "contributors": [ - "Ihssan Tinawi " - ], - "description": "mlprimitives.candidates.timeseries_errors.get_forecast_errors", - "classifiers": { - "type": "preprocessor", - "subtype": "feature_extractor" - }, - "modalities": [ - "timeseries" - ], - "primitive": "mlprimitives.candidates.timeseries_errors.get_forecast_errors", - "produce": { - "args": [ - { - "name": "y_hat", - "type": "np.array" - }, - { - "name": "y_true", - "type": "np.array" - } - ], - "output": [ - { - "name": "moving_avg", - "type": "list" - } - ] - }, - "hyperparameters": { - "tunable": { - "window_size": { - "type": "int", - 
"default": 5 - }, - "batch_size": { - "type": "int", - "default": 30 - }, - "smoothing_percent": { - "type": "float", - "default": 0.05 - } - }, - "fixed": { - "smoothed": { - "type": "bool", - "default": true - } - } - } -} diff --git a/pipelines/graph.graph_matching.nx.xgb.json b/mlprimitives/pipelines/graph.graph_matching.nx.xgb.json similarity index 100% rename from pipelines/graph.graph_matching.nx.xgb.json rename to mlprimitives/pipelines/graph.graph_matching.nx.xgb.json diff --git a/pipelines/graph.link_prediction.nx.xgb.json b/mlprimitives/pipelines/graph.link_prediction.nx.xgb.json similarity index 100% rename from pipelines/graph.link_prediction.nx.xgb.json rename to mlprimitives/pipelines/graph.link_prediction.nx.xgb.json diff --git a/pipelines/image.classification.hog.rf.json b/mlprimitives/pipelines/image.classification.hog.rf.json similarity index 100% rename from pipelines/image.classification.hog.rf.json rename to mlprimitives/pipelines/image.classification.hog.rf.json diff --git a/pipelines/image.classification.hog.xgb.json b/mlprimitives/pipelines/image.classification.hog.xgb.json similarity index 100% rename from pipelines/image.classification.hog.xgb.json rename to mlprimitives/pipelines/image.classification.hog.xgb.json diff --git a/pipelines/image.classification.resnet50.xgb.json b/mlprimitives/pipelines/image.classification.resnet50.xgb.json similarity index 100% rename from pipelines/image.classification.resnet50.xgb.json rename to mlprimitives/pipelines/image.classification.resnet50.xgb.json diff --git a/pipelines/image.regression.hog.rf.json b/mlprimitives/pipelines/image.regression.hog.rf.json similarity index 100% rename from pipelines/image.regression.hog.rf.json rename to mlprimitives/pipelines/image.regression.hog.rf.json diff --git a/pipelines/image.regression.hog.xgb.json b/mlprimitives/pipelines/image.regression.hog.xgb.json similarity index 100% rename from pipelines/image.regression.hog.xgb.json rename to mlprimitives/pipelines/image.regression.hog.xgb.json diff --git a/pipelines/image.regression.resnet50.xgb.json b/mlprimitives/pipelines/image.regression.resnet50.xgb.json similarity index 100% rename from pipelines/image.regression.resnet50.xgb.json rename to mlprimitives/pipelines/image.regression.resnet50.xgb.json diff --git a/pipelines/keras.Sequential.LSTMBinaryTextClassifier.json b/mlprimitives/pipelines/keras.Sequential.LSTMBinaryTextClassifier.json similarity index 100% rename from pipelines/keras.Sequential.LSTMBinaryTextClassifier.json rename to mlprimitives/pipelines/keras.Sequential.LSTMBinaryTextClassifier.json diff --git a/pipelines/keras.Sequential.LSTMTextClassifier.json b/mlprimitives/pipelines/keras.Sequential.LSTMTextClassifier.json similarity index 100% rename from pipelines/keras.Sequential.LSTMTextClassifier.json rename to mlprimitives/pipelines/keras.Sequential.LSTMTextClassifier.json diff --git a/pipelines/keras.Sequential.MLPBinaryClassifier.json b/mlprimitives/pipelines/keras.Sequential.MLPBinaryClassifier.json similarity index 100% rename from pipelines/keras.Sequential.MLPBinaryClassifier.json rename to mlprimitives/pipelines/keras.Sequential.MLPBinaryClassifier.json diff --git a/pipelines/keras.Sequential.MLPMultiClassClassifier.json b/mlprimitives/pipelines/keras.Sequential.MLPMultiClassClassifier.json similarity index 100% rename from pipelines/keras.Sequential.MLPMultiClassClassifier.json rename to mlprimitives/pipelines/keras.Sequential.MLPMultiClassClassifier.json diff --git 
a/pipelines/keras.Sequential.SingleLayerCNNImageClassifier.json b/mlprimitives/pipelines/keras.Sequential.SingleLayerCNNImageClassifier.json similarity index 100% rename from pipelines/keras.Sequential.SingleLayerCNNImageClassifier.json rename to mlprimitives/pipelines/keras.Sequential.SingleLayerCNNImageClassifier.json diff --git a/pipelines/keras.Sequential.SingleLayerCNNImageRegressor.json b/mlprimitives/pipelines/keras.Sequential.SingleLayerCNNImageRegressor.json similarity index 100% rename from pipelines/keras.Sequential.SingleLayerCNNImageRegressor.json rename to mlprimitives/pipelines/keras.Sequential.SingleLayerCNNImageRegressor.json diff --git a/pipelines/keras.Sequential.VGGCNNClassifier.json b/mlprimitives/pipelines/keras.Sequential.VGGCNNClassifier.json similarity index 100% rename from pipelines/keras.Sequential.VGGCNNClassifier.json rename to mlprimitives/pipelines/keras.Sequential.VGGCNNClassifier.json diff --git a/pipelines/mlprimitives.custom.feature_extraction.CategoricalEncoder.json b/mlprimitives/pipelines/mlprimitives.custom.feature_extraction.CategoricalEncoder.json similarity index 100% rename from pipelines/mlprimitives.custom.feature_extraction.CategoricalEncoder.json rename to mlprimitives/pipelines/mlprimitives.custom.feature_extraction.CategoricalEncoder.json diff --git a/pipelines/mlprimitives.custom.feature_extraction.DatetimeFeaturizer.json b/mlprimitives/pipelines/mlprimitives.custom.feature_extraction.DatetimeFeaturizer.json similarity index 100% rename from pipelines/mlprimitives.custom.feature_extraction.DatetimeFeaturizer.json rename to mlprimitives/pipelines/mlprimitives.custom.feature_extraction.DatetimeFeaturizer.json diff --git a/pipelines/mlprimitives.custom.feature_extraction.StringVectorizer.json b/mlprimitives/pipelines/mlprimitives.custom.feature_extraction.StringVectorizer.json similarity index 100% rename from pipelines/mlprimitives.custom.feature_extraction.StringVectorizer.json rename to mlprimitives/pipelines/mlprimitives.custom.feature_extraction.StringVectorizer.json diff --git a/pipelines/mlprimitives.custom.feature_selection.ExtraTreesClassifierFeatureSelector.json b/mlprimitives/pipelines/mlprimitives.custom.feature_selection.ExtraTreesClassifierFeatureSelector.json similarity index 100% rename from pipelines/mlprimitives.custom.feature_selection.ExtraTreesClassifierFeatureSelector.json rename to mlprimitives/pipelines/mlprimitives.custom.feature_selection.ExtraTreesClassifierFeatureSelector.json diff --git a/pipelines/mlprimitives.custom.feature_selection.ExtraTreesRegressorFeatureSelector.json b/mlprimitives/pipelines/mlprimitives.custom.feature_selection.ExtraTreesRegressorFeatureSelector.json similarity index 100% rename from pipelines/mlprimitives.custom.feature_selection.ExtraTreesRegressorFeatureSelector.json rename to mlprimitives/pipelines/mlprimitives.custom.feature_selection.ExtraTreesRegressorFeatureSelector.json diff --git a/pipelines/mlprimitives.custom.text.TextCleaner.json b/mlprimitives/pipelines/mlprimitives.custom.text.TextCleaner.json similarity index 100% rename from pipelines/mlprimitives.custom.text.TextCleaner.json rename to mlprimitives/pipelines/mlprimitives.custom.text.TextCleaner.json diff --git a/pipelines/mlprimitives.custom.trivial.TrivialPredictor.mean.json b/mlprimitives/pipelines/mlprimitives.custom.trivial.TrivialPredictor.mean.json similarity index 100% rename from pipelines/mlprimitives.custom.trivial.TrivialPredictor.mean.json rename to 
mlprimitives/pipelines/mlprimitives.custom.trivial.TrivialPredictor.mean.json diff --git a/pipelines/mlprimitives.custom.trivial.TrivialPredictor.median.json b/mlprimitives/pipelines/mlprimitives.custom.trivial.TrivialPredictor.median.json similarity index 100% rename from pipelines/mlprimitives.custom.trivial.TrivialPredictor.median.json rename to mlprimitives/pipelines/mlprimitives.custom.trivial.TrivialPredictor.median.json diff --git a/pipelines/mlprimitives.custom.trivial.TrivialPredictor.mode.json b/mlprimitives/pipelines/mlprimitives.custom.trivial.TrivialPredictor.mode.json similarity index 100% rename from pipelines/mlprimitives.custom.trivial.TrivialPredictor.mode.json rename to mlprimitives/pipelines/mlprimitives.custom.trivial.TrivialPredictor.mode.json diff --git a/pipelines/multi_table.classification.dfs.xgb.json b/mlprimitives/pipelines/multi_table.classification.dfs.xgb.json similarity index 100% rename from pipelines/multi_table.classification.dfs.xgb.json rename to mlprimitives/pipelines/multi_table.classification.dfs.xgb.json diff --git a/mlprimitives/pipelines/single_table.classification.json b/mlprimitives/pipelines/single_table.classification.json new file mode 100644 index 00000000..ddc301f2 --- /dev/null +++ b/mlprimitives/pipelines/single_table.classification.json @@ -0,0 +1,19 @@ +{ + "metadata": { + "name": "single_table.classification", + "data_type": "single_table", + "task_type": "classification" + }, + "validation": { + "dataset": "personae" + }, + "primitives": [ + "mlprimitives.custom.preprocessing.ClassEncoder", + "mlprimitives.custom.feature_extraction.DatetimeFeaturizer", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "mlprimitives.custom.feature_extraction.StringVectorizer", + "sklearn.impute.SimpleImputer", + "xgboost.XGBClassifier", + "mlprimitives.custom.preprocessing.ClassDecoder" + ] +} diff --git a/pipelines/single_table.classification.text.json b/mlprimitives/pipelines/single_table.classification.text.json similarity index 100% rename from pipelines/single_table.classification.text.json rename to mlprimitives/pipelines/single_table.classification.text.json diff --git a/pipelines/single_table.classification.xgb.json b/mlprimitives/pipelines/single_table.classification.xgb.json similarity index 100% rename from pipelines/single_table.classification.xgb.json rename to mlprimitives/pipelines/single_table.classification.xgb.json diff --git a/pipelines/single_table.regression.xgb.json b/mlprimitives/pipelines/single_table.regression.xgb.json similarity index 100% rename from pipelines/single_table.regression.xgb.json rename to mlprimitives/pipelines/single_table.regression.xgb.json diff --git a/pipelines/sklearn.decomposition.DictionaryLearning.json b/mlprimitives/pipelines/sklearn.decomposition.DictionaryLearning.json similarity index 100% rename from pipelines/sklearn.decomposition.DictionaryLearning.json rename to mlprimitives/pipelines/sklearn.decomposition.DictionaryLearning.json diff --git a/pipelines/sklearn.decomposition.FactorAnalysis.json b/mlprimitives/pipelines/sklearn.decomposition.FactorAnalysis.json similarity index 100% rename from pipelines/sklearn.decomposition.FactorAnalysis.json rename to mlprimitives/pipelines/sklearn.decomposition.FactorAnalysis.json diff --git a/pipelines/sklearn.decomposition.FastICA.json b/mlprimitives/pipelines/sklearn.decomposition.FastICA.json similarity index 100% rename from pipelines/sklearn.decomposition.FastICA.json rename to mlprimitives/pipelines/sklearn.decomposition.FastICA.json 
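Note: because the pipeline JSONs are plain MLPipeline specifications, the new mlprimitives/pipelines/single_table.classification.json shown above can be scored programmatically through the updated score_pipeline helper from this diff. The following is only a usage sketch (the relative path is illustrative; dataset=None falls back to the validation.dataset declared in the spec, personae in this case):

from mlprimitives.evaluation import score_pipeline

# score_pipeline(pipeline_metadata, n_splits=5, random_state=0, dataset=None)
# is the signature introduced in mlprimitives/evaluation.py above.
score, stdev = score_pipeline(
    'mlprimitives/pipelines/single_table.classification.json',
    n_splits=5,        # number of Cross Validation splits
    random_state=0,    # forwarded to Dataset.get_splits for reproducibility
    dataset=None,      # None falls back to validation.dataset ('personae')
)
print('Obtained Score: {:.4f} +/- {:.4f}'.format(score, stdev))

Assuming the console entry point is named mlprimitives, the equivalent call through the new CLI arguments would be: mlprimitives test -s 5 -r 0 mlprimitives/pipelines/single_table.classification.json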
diff --git a/pipelines/sklearn.decomposition.KernelPCA.json b/mlprimitives/pipelines/sklearn.decomposition.KernelPCA.json similarity index 100% rename from pipelines/sklearn.decomposition.KernelPCA.json rename to mlprimitives/pipelines/sklearn.decomposition.KernelPCA.json diff --git a/pipelines/sklearn.decomposition.PCA.json b/mlprimitives/pipelines/sklearn.decomposition.PCA.json similarity index 100% rename from pipelines/sklearn.decomposition.PCA.json rename to mlprimitives/pipelines/sklearn.decomposition.PCA.json diff --git a/pipelines/sklearn.decomposition.TruncatedSVD.json b/mlprimitives/pipelines/sklearn.decomposition.TruncatedSVD.json similarity index 100% rename from pipelines/sklearn.decomposition.TruncatedSVD.json rename to mlprimitives/pipelines/sklearn.decomposition.TruncatedSVD.json diff --git a/pipelines/sklearn.ensemble.AdaBoostClassifier.json b/mlprimitives/pipelines/sklearn.ensemble.AdaBoostClassifier.json similarity index 100% rename from pipelines/sklearn.ensemble.AdaBoostClassifier.json rename to mlprimitives/pipelines/sklearn.ensemble.AdaBoostClassifier.json diff --git a/pipelines/sklearn.ensemble.AdaBoostRegressor.json b/mlprimitives/pipelines/sklearn.ensemble.AdaBoostRegressor.json similarity index 100% rename from pipelines/sklearn.ensemble.AdaBoostRegressor.json rename to mlprimitives/pipelines/sklearn.ensemble.AdaBoostRegressor.json diff --git a/pipelines/sklearn.ensemble.BaggingClassifier.json b/mlprimitives/pipelines/sklearn.ensemble.BaggingClassifier.json similarity index 100% rename from pipelines/sklearn.ensemble.BaggingClassifier.json rename to mlprimitives/pipelines/sklearn.ensemble.BaggingClassifier.json diff --git a/pipelines/sklearn.ensemble.BaggingRegressor.json b/mlprimitives/pipelines/sklearn.ensemble.BaggingRegressor.json similarity index 100% rename from pipelines/sklearn.ensemble.BaggingRegressor.json rename to mlprimitives/pipelines/sklearn.ensemble.BaggingRegressor.json diff --git a/pipelines/sklearn.ensemble.ExtraTreesClassifier.json b/mlprimitives/pipelines/sklearn.ensemble.ExtraTreesClassifier.json similarity index 100% rename from pipelines/sklearn.ensemble.ExtraTreesClassifier.json rename to mlprimitives/pipelines/sklearn.ensemble.ExtraTreesClassifier.json diff --git a/pipelines/sklearn.ensemble.ExtraTreesRegressor.json b/mlprimitives/pipelines/sklearn.ensemble.ExtraTreesRegressor.json similarity index 100% rename from pipelines/sklearn.ensemble.ExtraTreesRegressor.json rename to mlprimitives/pipelines/sklearn.ensemble.ExtraTreesRegressor.json diff --git a/pipelines/sklearn.ensemble.GradientBoostingClassifier.json b/mlprimitives/pipelines/sklearn.ensemble.GradientBoostingClassifier.json similarity index 100% rename from pipelines/sklearn.ensemble.GradientBoostingClassifier.json rename to mlprimitives/pipelines/sklearn.ensemble.GradientBoostingClassifier.json diff --git a/pipelines/sklearn.ensemble.GradientBoostingRegressor.json b/mlprimitives/pipelines/sklearn.ensemble.GradientBoostingRegressor.json similarity index 100% rename from pipelines/sklearn.ensemble.GradientBoostingRegressor.json rename to mlprimitives/pipelines/sklearn.ensemble.GradientBoostingRegressor.json diff --git a/pipelines/sklearn.ensemble.IsolationForest.json b/mlprimitives/pipelines/sklearn.ensemble.IsolationForest.json similarity index 100% rename from pipelines/sklearn.ensemble.IsolationForest.json rename to mlprimitives/pipelines/sklearn.ensemble.IsolationForest.json diff --git a/pipelines/sklearn.ensemble.RandomForestClassifier.json 
b/mlprimitives/pipelines/sklearn.ensemble.RandomForestClassifier.json similarity index 100% rename from pipelines/sklearn.ensemble.RandomForestClassifier.json rename to mlprimitives/pipelines/sklearn.ensemble.RandomForestClassifier.json diff --git a/pipelines/sklearn.ensemble.RandomForestRegressor.json b/mlprimitives/pipelines/sklearn.ensemble.RandomForestRegressor.json similarity index 100% rename from pipelines/sklearn.ensemble.RandomForestRegressor.json rename to mlprimitives/pipelines/sklearn.ensemble.RandomForestRegressor.json diff --git a/pipelines/sklearn.ensemble.RandomTreesEmbedding.json b/mlprimitives/pipelines/sklearn.ensemble.RandomTreesEmbedding.json similarity index 100% rename from pipelines/sklearn.ensemble.RandomTreesEmbedding.json rename to mlprimitives/pipelines/sklearn.ensemble.RandomTreesEmbedding.json diff --git a/pipelines/sklearn.impute.SimpleImputer.json b/mlprimitives/pipelines/sklearn.impute.SimpleImputer.json similarity index 100% rename from pipelines/sklearn.impute.SimpleImputer.json rename to mlprimitives/pipelines/sklearn.impute.SimpleImputer.json diff --git a/pipelines/sklearn.linear_model.ElasticNet.json b/mlprimitives/pipelines/sklearn.linear_model.ElasticNet.json similarity index 100% rename from pipelines/sklearn.linear_model.ElasticNet.json rename to mlprimitives/pipelines/sklearn.linear_model.ElasticNet.json diff --git a/pipelines/sklearn.linear_model.Lars.json b/mlprimitives/pipelines/sklearn.linear_model.Lars.json similarity index 100% rename from pipelines/sklearn.linear_model.Lars.json rename to mlprimitives/pipelines/sklearn.linear_model.Lars.json diff --git a/pipelines/sklearn.linear_model.Lasso.json b/mlprimitives/pipelines/sklearn.linear_model.Lasso.json similarity index 100% rename from pipelines/sklearn.linear_model.Lasso.json rename to mlprimitives/pipelines/sklearn.linear_model.Lasso.json diff --git a/pipelines/sklearn.linear_model.LinearRegression.json b/mlprimitives/pipelines/sklearn.linear_model.LinearRegression.json similarity index 100% rename from pipelines/sklearn.linear_model.LinearRegression.json rename to mlprimitives/pipelines/sklearn.linear_model.LinearRegression.json diff --git a/pipelines/sklearn.linear_model.LogisticRegression.json b/mlprimitives/pipelines/sklearn.linear_model.LogisticRegression.json similarity index 100% rename from pipelines/sklearn.linear_model.LogisticRegression.json rename to mlprimitives/pipelines/sklearn.linear_model.LogisticRegression.json diff --git a/pipelines/sklearn.linear_model.MultiTaskLasso.json b/mlprimitives/pipelines/sklearn.linear_model.MultiTaskLasso.json similarity index 100% rename from pipelines/sklearn.linear_model.MultiTaskLasso.json rename to mlprimitives/pipelines/sklearn.linear_model.MultiTaskLasso.json diff --git a/pipelines/sklearn.linear_model.Ridge.json b/mlprimitives/pipelines/sklearn.linear_model.Ridge.json similarity index 100% rename from pipelines/sklearn.linear_model.Ridge.json rename to mlprimitives/pipelines/sklearn.linear_model.Ridge.json diff --git a/pipelines/sklearn.preprocessing.MaxAbsScaler.json b/mlprimitives/pipelines/sklearn.preprocessing.MaxAbsScaler.json similarity index 100% rename from pipelines/sklearn.preprocessing.MaxAbsScaler.json rename to mlprimitives/pipelines/sklearn.preprocessing.MaxAbsScaler.json diff --git a/pipelines/sklearn.preprocessing.MinMaxScaler.json b/mlprimitives/pipelines/sklearn.preprocessing.MinMaxScaler.json similarity index 100% rename from pipelines/sklearn.preprocessing.MinMaxScaler.json rename to 
mlprimitives/pipelines/sklearn.preprocessing.MinMaxScaler.json diff --git a/pipelines/sklearn.preprocessing.RobustScaler.json b/mlprimitives/pipelines/sklearn.preprocessing.RobustScaler.json similarity index 100% rename from pipelines/sklearn.preprocessing.RobustScaler.json rename to mlprimitives/pipelines/sklearn.preprocessing.RobustScaler.json diff --git a/pipelines/text.classification.lstm.json b/mlprimitives/pipelines/text.classification.lstm.json similarity index 100% rename from pipelines/text.classification.lstm.json rename to mlprimitives/pipelines/text.classification.lstm.json diff --git a/mlprimitives/jsons/community.CommunityBestPartition.json b/mlprimitives/primitives/community.CommunityBestPartition.json similarity index 100% rename from mlprimitives/jsons/community.CommunityBestPartition.json rename to mlprimitives/primitives/community.CommunityBestPartition.json diff --git a/mlprimitives/jsons/cv2.GaussianBlur.json b/mlprimitives/primitives/cv2.GaussianBlur.json similarity index 100% rename from mlprimitives/jsons/cv2.GaussianBlur.json rename to mlprimitives/primitives/cv2.GaussianBlur.json diff --git a/mlprimitives/jsons/featuretools.EntitySet.add_relationship.json b/mlprimitives/primitives/featuretools.EntitySet.add_relationship.json similarity index 94% rename from mlprimitives/jsons/featuretools.EntitySet.add_relationship.json rename to mlprimitives/primitives/featuretools.EntitySet.add_relationship.json index 1528ebb1..11f301fd 100644 --- a/mlprimitives/jsons/featuretools.EntitySet.add_relationship.json +++ b/mlprimitives/primitives/featuretools.EntitySet.add_relationship.json @@ -6,8 +6,7 @@ "documentation": "https://docs.featuretools.com/generated/featuretools.EntitySet.entity_from_dataframe.html#featuretools-entityset-entity-from-dataframe", "description": "Load the data for a specified entity from a Pandas DataFrame. Create an EntitySet if it does not exist", "classifiers": { - "type": "preprocessor", - "subtype": "feature_extractor" + "type": "helper" }, "modalities": [], "primitive": "mlprimitives.adapters.featuretools.add_relationship", diff --git a/mlprimitives/jsons/featuretools.EntitySet.entity_from_dataframe.json b/mlprimitives/primitives/featuretools.EntitySet.entity_from_dataframe.json similarity index 97% rename from mlprimitives/jsons/featuretools.EntitySet.entity_from_dataframe.json rename to mlprimitives/primitives/featuretools.EntitySet.entity_from_dataframe.json index d367853c..6a17a25d 100644 --- a/mlprimitives/jsons/featuretools.EntitySet.entity_from_dataframe.json +++ b/mlprimitives/primitives/featuretools.EntitySet.entity_from_dataframe.json @@ -6,8 +6,7 @@ "documentation": "https://docs.featuretools.com/generated/featuretools.EntitySet.entity_from_dataframe.html#featuretools-entityset-entity-from-dataframe", "description": "Load the data for a specified entity from a Pandas DataFrame. 
Create an EntitySet if it does not exist", "classifiers": { - "type": "preprocessor", - "subtype": "feature_extractor" + "type": "helper" }, "modalities": [], "primitive": "mlprimitives.adapters.featuretools.entity_from_dataframe", diff --git a/mlprimitives/jsons/featuretools.dfs.json b/mlprimitives/primitives/featuretools.dfs.json similarity index 100% rename from mlprimitives/jsons/featuretools.dfs.json rename to mlprimitives/primitives/featuretools.dfs.json diff --git a/mlprimitives/jsons/keras.Sequential.LSTMBinaryTextClassifier.json b/mlprimitives/primitives/keras.Sequential.LSTMBinaryTextClassifier.json similarity index 100% rename from mlprimitives/jsons/keras.Sequential.LSTMBinaryTextClassifier.json rename to mlprimitives/primitives/keras.Sequential.LSTMBinaryTextClassifier.json diff --git a/mlprimitives/jsons/keras.Sequential.LSTMTextClassifier.json b/mlprimitives/primitives/keras.Sequential.LSTMTextClassifier.json similarity index 99% rename from mlprimitives/jsons/keras.Sequential.LSTMTextClassifier.json rename to mlprimitives/primitives/keras.Sequential.LSTMTextClassifier.json index 3ec46fd3..6a55f6b4 100644 --- a/mlprimitives/jsons/keras.Sequential.LSTMTextClassifier.json +++ b/mlprimitives/primitives/keras.Sequential.LSTMTextClassifier.json @@ -6,7 +6,7 @@ "description": "keras.Sequential.LSTMTextClassifier", "classifiers": { "type": "estimator", - "subtype": "regressor" + "subtype": "classifier" }, "modalities": [ "text" diff --git a/mlprimitives/jsons/keras.Sequential.LSTMTextRegressor.json b/mlprimitives/primitives/keras.Sequential.LSTMTextRegressor.json similarity index 100% rename from mlprimitives/jsons/keras.Sequential.LSTMTextRegressor.json rename to mlprimitives/primitives/keras.Sequential.LSTMTextRegressor.json diff --git a/mlprimitives/jsons/keras.Sequential.LSTMTimeSeriesRegressor.json b/mlprimitives/primitives/keras.Sequential.LSTMTimeSeriesRegressor.json similarity index 99% rename from mlprimitives/jsons/keras.Sequential.LSTMTimeSeriesRegressor.json rename to mlprimitives/primitives/keras.Sequential.LSTMTimeSeriesRegressor.json index 93504aea..54a2a866 100644 --- a/mlprimitives/jsons/keras.Sequential.LSTMTimeSeriesRegressor.json +++ b/mlprimitives/primitives/keras.Sequential.LSTMTimeSeriesRegressor.json @@ -71,7 +71,7 @@ "type": "float", "default": 0.2 }, - "bastch_size": { + "batch_size": { "type": "int", "default": 64 }, diff --git a/mlprimitives/jsons/keras.Sequential.MLPBinaryClassifier.json b/mlprimitives/primitives/keras.Sequential.MLPBinaryClassifier.json similarity index 100% rename from mlprimitives/jsons/keras.Sequential.MLPBinaryClassifier.json rename to mlprimitives/primitives/keras.Sequential.MLPBinaryClassifier.json diff --git a/mlprimitives/jsons/keras.Sequential.MLPMultiClassClassifier.json b/mlprimitives/primitives/keras.Sequential.MLPMultiClassClassifier.json similarity index 100% rename from mlprimitives/jsons/keras.Sequential.MLPMultiClassClassifier.json rename to mlprimitives/primitives/keras.Sequential.MLPMultiClassClassifier.json diff --git a/mlprimitives/jsons/keras.Sequential.SingleLayerCNNImageClassifier.json b/mlprimitives/primitives/keras.Sequential.SingleLayerCNNImageClassifier.json similarity index 100% rename from mlprimitives/jsons/keras.Sequential.SingleLayerCNNImageClassifier.json rename to mlprimitives/primitives/keras.Sequential.SingleLayerCNNImageClassifier.json diff --git a/mlprimitives/jsons/keras.Sequential.SingleLayerCNNImageRegressor.json b/mlprimitives/primitives/keras.Sequential.SingleLayerCNNImageRegressor.json 
similarity index 100% rename from mlprimitives/jsons/keras.Sequential.SingleLayerCNNImageRegressor.json rename to mlprimitives/primitives/keras.Sequential.SingleLayerCNNImageRegressor.json diff --git a/mlprimitives/jsons/keras.Sequential.VGGCNNClassifier.json b/mlprimitives/primitives/keras.Sequential.VGGCNNClassifier.json similarity index 100% rename from mlprimitives/jsons/keras.Sequential.VGGCNNClassifier.json rename to mlprimitives/primitives/keras.Sequential.VGGCNNClassifier.json diff --git a/mlprimitives/jsons/keras.applications.densenet.DenseNet121.json b/mlprimitives/primitives/keras.applications.densenet.DenseNet121.json similarity index 100% rename from mlprimitives/jsons/keras.applications.densenet.DenseNet121.json rename to mlprimitives/primitives/keras.applications.densenet.DenseNet121.json diff --git a/mlprimitives/jsons/keras.applications.densenet.DenseNet169.json b/mlprimitives/primitives/keras.applications.densenet.DenseNet169.json similarity index 100% rename from mlprimitives/jsons/keras.applications.densenet.DenseNet169.json rename to mlprimitives/primitives/keras.applications.densenet.DenseNet169.json diff --git a/mlprimitives/jsons/keras.applications.densenet.DenseNet201.json b/mlprimitives/primitives/keras.applications.densenet.DenseNet201.json similarity index 100% rename from mlprimitives/jsons/keras.applications.densenet.DenseNet201.json rename to mlprimitives/primitives/keras.applications.densenet.DenseNet201.json diff --git a/mlprimitives/jsons/keras.applications.densenet.preprocess_input.json b/mlprimitives/primitives/keras.applications.densenet.preprocess_input.json similarity index 100% rename from mlprimitives/jsons/keras.applications.densenet.preprocess_input.json rename to mlprimitives/primitives/keras.applications.densenet.preprocess_input.json diff --git a/mlprimitives/jsons/keras.applications.inception_v3.InceptionV3.json b/mlprimitives/primitives/keras.applications.inception_v3.InceptionV3.json similarity index 100% rename from mlprimitives/jsons/keras.applications.inception_v3.InceptionV3.json rename to mlprimitives/primitives/keras.applications.inception_v3.InceptionV3.json diff --git a/mlprimitives/jsons/keras.applications.inception_v3.preprocess_input.json b/mlprimitives/primitives/keras.applications.inception_v3.preprocess_input.json similarity index 100% rename from mlprimitives/jsons/keras.applications.inception_v3.preprocess_input.json rename to mlprimitives/primitives/keras.applications.inception_v3.preprocess_input.json diff --git a/mlprimitives/jsons/keras.applications.mobilenet.MobileNet.json b/mlprimitives/primitives/keras.applications.mobilenet.MobileNet.json similarity index 100% rename from mlprimitives/jsons/keras.applications.mobilenet.MobileNet.json rename to mlprimitives/primitives/keras.applications.mobilenet.MobileNet.json diff --git a/mlprimitives/jsons/keras.applications.mobilenet.preprocess_input.json b/mlprimitives/primitives/keras.applications.mobilenet.preprocess_input.json similarity index 100% rename from mlprimitives/jsons/keras.applications.mobilenet.preprocess_input.json rename to mlprimitives/primitives/keras.applications.mobilenet.preprocess_input.json diff --git a/mlprimitives/jsons/keras.applications.resnet50.ResNet50.json b/mlprimitives/primitives/keras.applications.resnet50.ResNet50.json similarity index 100% rename from mlprimitives/jsons/keras.applications.resnet50.ResNet50.json rename to mlprimitives/primitives/keras.applications.resnet50.ResNet50.json diff --git 
a/mlprimitives/jsons/keras.applications.resnet50.preprocess_input.json b/mlprimitives/primitives/keras.applications.resnet50.preprocess_input.json similarity index 100% rename from mlprimitives/jsons/keras.applications.resnet50.preprocess_input.json rename to mlprimitives/primitives/keras.applications.resnet50.preprocess_input.json diff --git a/mlprimitives/jsons/keras.applications.xception.Xception.json b/mlprimitives/primitives/keras.applications.xception.Xception.json similarity index 100% rename from mlprimitives/jsons/keras.applications.xception.Xception.json rename to mlprimitives/primitives/keras.applications.xception.Xception.json diff --git a/mlprimitives/jsons/keras.applications.xception.preprocess_input.json b/mlprimitives/primitives/keras.applications.xception.preprocess_input.json similarity index 100% rename from mlprimitives/jsons/keras.applications.xception.preprocess_input.json rename to mlprimitives/primitives/keras.applications.xception.preprocess_input.json diff --git a/mlprimitives/jsons/keras.preprocessing.sequence.pad_sequences.json b/mlprimitives/primitives/keras.preprocessing.sequence.pad_sequences.json similarity index 100% rename from mlprimitives/jsons/keras.preprocessing.sequence.pad_sequences.json rename to mlprimitives/primitives/keras.preprocessing.sequence.pad_sequences.json diff --git a/mlprimitives/jsons/keras.preprocessing.text.Tokenizer.json b/mlprimitives/primitives/keras.preprocessing.text.Tokenizer.json similarity index 100% rename from mlprimitives/jsons/keras.preprocessing.text.Tokenizer.json rename to mlprimitives/primitives/keras.preprocessing.text.Tokenizer.json diff --git a/mlprimitives/jsons/lightfm.LightFM.json b/mlprimitives/primitives/lightfm.LightFM.json similarity index 100% rename from mlprimitives/jsons/lightfm.LightFM.json rename to mlprimitives/primitives/lightfm.LightFM.json diff --git a/mlprimitives/jsons/mlprimitives.candidates.audio_featurization.featurize_audio.json b/mlprimitives/primitives/mlprimitives.candidates.audio_featurization.featurize_audio.json similarity index 100% rename from mlprimitives/jsons/mlprimitives.candidates.audio_featurization.featurize_audio.json rename to mlprimitives/primitives/mlprimitives.candidates.audio_featurization.featurize_audio.json diff --git a/mlprimitives/jsons/mlprimitives.candidates.audio_padding.AudioPadder.json b/mlprimitives/primitives/mlprimitives.candidates.audio_padding.AudioPadder.json similarity index 100% rename from mlprimitives/jsons/mlprimitives.candidates.audio_padding.AudioPadder.json rename to mlprimitives/primitives/mlprimitives.candidates.audio_padding.AudioPadder.json diff --git a/mlprimitives/jsons/mlprimitives.candidates.dsp.SpectralMask.json b/mlprimitives/primitives/mlprimitives.candidates.dsp.SpectralMask.json similarity index 100% rename from mlprimitives/jsons/mlprimitives.candidates.dsp.SpectralMask.json rename to mlprimitives/primitives/mlprimitives.candidates.dsp.SpectralMask.json diff --git a/mlprimitives/jsons/mlprimitives.custom.counters.UniqueCounter.json b/mlprimitives/primitives/mlprimitives.custom.counters.UniqueCounter.json similarity index 93% rename from mlprimitives/jsons/mlprimitives.custom.counters.UniqueCounter.json rename to mlprimitives/primitives/mlprimitives.custom.counters.UniqueCounter.json index 9308cfe2..0dd26805 100644 --- a/mlprimitives/jsons/mlprimitives.custom.counters.UniqueCounter.json +++ b/mlprimitives/primitives/mlprimitives.custom.counters.UniqueCounter.json @@ -5,8 +5,8 @@ ], "description": "Count the number of unique values in 
each column of a matrix.", "classifiers": { - "type": "preprocessor", - "subtype": "feature_extractor" + "type": "helper", + "subtype": "counter" }, "modalities": [], "primitive": "mlprimitives.custom.counters.UniqueCounter", diff --git a/mlprimitives/jsons/mlprimitives.custom.counters.VocabularyCounter.json b/mlprimitives/primitives/mlprimitives.custom.counters.VocabularyCounter.json similarity index 93% rename from mlprimitives/jsons/mlprimitives.custom.counters.VocabularyCounter.json rename to mlprimitives/primitives/mlprimitives.custom.counters.VocabularyCounter.json index 42828aa1..3fc31d9b 100644 --- a/mlprimitives/jsons/mlprimitives.custom.counters.VocabularyCounter.json +++ b/mlprimitives/primitives/mlprimitives.custom.counters.VocabularyCounter.json @@ -5,8 +5,8 @@ ], "description": "Count the number of different words in a collection of texts.", "classifiers": { - "type": "preprocessor", - "subtype": "feature_extractor" + "type": "helper", + "subtype": "counter" }, "modalities": [], "primitive": "mlprimitives.custom.counters.VocabularyCounter", diff --git a/mlprimitives/jsons/mlprimitives.custom.counters.count_features.json b/mlprimitives/primitives/mlprimitives.custom.counters.count_features.json similarity index 91% rename from mlprimitives/jsons/mlprimitives.custom.counters.count_features.json rename to mlprimitives/primitives/mlprimitives.custom.counters.count_features.json index 37569b41..e9b203f6 100644 --- a/mlprimitives/jsons/mlprimitives.custom.counters.count_features.json +++ b/mlprimitives/primitives/mlprimitives.custom.counters.count_features.json @@ -5,8 +5,8 @@ ], "description": "Count the number of features in a 2d feature matrix.", "classifiers": { - "type": "preprocessor", - "subtype": "feature_extractor" + "type": "helper", + "subtype": "counter" }, "modalities": [], "primitive": "mlprimitives.custom.counters.count_features", diff --git a/mlprimitives/jsons/mlprimitives.custom.feature_extraction.CategoricalEncoder.json b/mlprimitives/primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json similarity index 86% rename from mlprimitives/jsons/mlprimitives.custom.feature_extraction.CategoricalEncoder.json rename to mlprimitives/primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json index b8cb2541..2c139cca 100644 --- a/mlprimitives/jsons/mlprimitives.custom.feature_extraction.CategoricalEncoder.json +++ b/mlprimitives/primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json @@ -36,6 +36,10 @@ }, "hyperparameters": { "fixed": { + "keep": { + "type": "bool", + "default": false + }, "copy": { "type": "bool", "default": true @@ -43,6 +47,10 @@ "features": { "type": "str or list", "default": "auto" + }, + "max_unique_ratio": { + "type": "float", + "default": 1 } }, "tunable": { diff --git a/mlprimitives/jsons/mlprimitives.custom.feature_extraction.DatetimeFeaturizer.json b/mlprimitives/primitives/mlprimitives.custom.feature_extraction.DatetimeFeaturizer.json similarity index 92% rename from mlprimitives/jsons/mlprimitives.custom.feature_extraction.DatetimeFeaturizer.json rename to mlprimitives/primitives/mlprimitives.custom.feature_extraction.DatetimeFeaturizer.json index 872767b0..1ea739ea 100644 --- a/mlprimitives/jsons/mlprimitives.custom.feature_extraction.DatetimeFeaturizer.json +++ b/mlprimitives/primitives/mlprimitives.custom.feature_extraction.DatetimeFeaturizer.json @@ -36,6 +36,10 @@ }, "hyperparameters": { "fixed": { + "keep": { + "type": "bool", + "default": false + }, "copy": { "type": "bool", "default": 
true diff --git a/mlprimitives/jsons/mlprimitives.custom.feature_extraction.StringVectorizer.json b/mlprimitives/primitives/mlprimitives.custom.feature_extraction.StringVectorizer.json similarity index 82% rename from mlprimitives/jsons/mlprimitives.custom.feature_extraction.StringVectorizer.json rename to mlprimitives/primitives/mlprimitives.custom.feature_extraction.StringVectorizer.json index c54fe05c..41a85d3d 100644 --- a/mlprimitives/jsons/mlprimitives.custom.feature_extraction.StringVectorizer.json +++ b/mlprimitives/primitives/mlprimitives.custom.feature_extraction.StringVectorizer.json @@ -36,6 +36,14 @@ }, "hyperparameters": { "fixed": { + "keep": { + "type": "bool", + "default": false + }, + "copy": { + "type": "bool", + "default": true + }, "features": { "type": "str or list", "default": "auto" @@ -51,6 +59,10 @@ "analyzer": { "type": "str", "default": "word" + }, + "min_words": { + "type": "int", + "default": 3 } }, "tunable": { @@ -64,10 +76,10 @@ }, "max_features": { "type": "int", - "default": 1, + "default": 1000, "range": [ 1, - 1000 + 10000 ] } } diff --git a/mlprimitives/jsons/mlprimitives.custom.feature_selection.ExtraTreesClassifierFeatureSelector.json b/mlprimitives/primitives/mlprimitives.custom.feature_selection.ExtraTreesClassifierFeatureSelector.json similarity index 100% rename from mlprimitives/jsons/mlprimitives.custom.feature_selection.ExtraTreesClassifierFeatureSelector.json rename to mlprimitives/primitives/mlprimitives.custom.feature_selection.ExtraTreesClassifierFeatureSelector.json diff --git a/mlprimitives/jsons/mlprimitives.custom.feature_selection.ExtraTreesRegressorFeatureSelector.json b/mlprimitives/primitives/mlprimitives.custom.feature_selection.ExtraTreesRegressorFeatureSelector.json similarity index 100% rename from mlprimitives/jsons/mlprimitives.custom.feature_selection.ExtraTreesRegressorFeatureSelector.json rename to mlprimitives/primitives/mlprimitives.custom.feature_selection.ExtraTreesRegressorFeatureSelector.json diff --git a/mlprimitives/jsons/mlprimitives.custom.preprocessing.ClassDecoder.json b/mlprimitives/primitives/mlprimitives.custom.preprocessing.ClassDecoder.json similarity index 100% rename from mlprimitives/jsons/mlprimitives.custom.preprocessing.ClassDecoder.json rename to mlprimitives/primitives/mlprimitives.custom.preprocessing.ClassDecoder.json diff --git a/mlprimitives/jsons/mlprimitives.custom.preprocessing.ClassEncoder.json b/mlprimitives/primitives/mlprimitives.custom.preprocessing.ClassEncoder.json similarity index 100% rename from mlprimitives/jsons/mlprimitives.custom.preprocessing.ClassEncoder.json rename to mlprimitives/primitives/mlprimitives.custom.preprocessing.ClassEncoder.json diff --git a/mlprimitives/jsons/mlprimitives.custom.text.TextCleaner.json b/mlprimitives/primitives/mlprimitives.custom.text.TextCleaner.json similarity index 100% rename from mlprimitives/jsons/mlprimitives.custom.text.TextCleaner.json rename to mlprimitives/primitives/mlprimitives.custom.text.TextCleaner.json diff --git a/mlprimitives/jsons/mlprimitives.custom.timeseries_anomalies.find_anomalies.json b/mlprimitives/primitives/mlprimitives.custom.timeseries_anomalies.find_anomalies.json similarity index 76% rename from mlprimitives/jsons/mlprimitives.custom.timeseries_anomalies.find_anomalies.json rename to mlprimitives/primitives/mlprimitives.custom.timeseries_anomalies.find_anomalies.json index 7cdf43a2..92fae884 100644 --- a/mlprimitives/jsons/mlprimitives.custom.timeseries_anomalies.find_anomalies.json +++ 
b/mlprimitives/primitives/mlprimitives.custom.timeseries_anomalies.find_anomalies.json @@ -42,7 +42,15 @@ }, "window_size": { "type": "int", - "default": null + "default": 2000 + }, + "window_step_size": { + "type": "int", + "default": 200 + }, + "lower_threshold": { + "type": "bool", + "default": false } }, "tunable": { @@ -53,6 +61,14 @@ 0.01, 0.9 ] + }, + "anomaly_padding": { + "type": "int", + "default": 50, + "range": [ + 0, + 400 + ] } } } diff --git a/mlprimitives/jsons/mlprimitives.custom.timeseries_anomalies.regression_errors.json b/mlprimitives/primitives/mlprimitives.custom.timeseries_anomalies.regression_errors.json similarity index 100% rename from mlprimitives/jsons/mlprimitives.custom.timeseries_anomalies.regression_errors.json rename to mlprimitives/primitives/mlprimitives.custom.timeseries_anomalies.regression_errors.json diff --git a/mlprimitives/jsons/mlprimitives.custom.timeseries_preprocessing.rolling_window_sequences.json b/mlprimitives/primitives/mlprimitives.custom.timeseries_preprocessing.rolling_window_sequences.json similarity index 100% rename from mlprimitives/jsons/mlprimitives.custom.timeseries_preprocessing.rolling_window_sequences.json rename to mlprimitives/primitives/mlprimitives.custom.timeseries_preprocessing.rolling_window_sequences.json diff --git a/mlprimitives/jsons/mlprimitives.custom.timeseries_preprocessing.time_segments_aggregate.json b/mlprimitives/primitives/mlprimitives.custom.timeseries_preprocessing.time_segments_aggregate.json similarity index 100% rename from mlprimitives/jsons/mlprimitives.custom.timeseries_preprocessing.time_segments_aggregate.json rename to mlprimitives/primitives/mlprimitives.custom.timeseries_preprocessing.time_segments_aggregate.json diff --git a/mlprimitives/jsons/mlprimitives.custom.timeseries_preprocessing.time_segments_average.json b/mlprimitives/primitives/mlprimitives.custom.timeseries_preprocessing.time_segments_average.json similarity index 100% rename from mlprimitives/jsons/mlprimitives.custom.timeseries_preprocessing.time_segments_average.json rename to mlprimitives/primitives/mlprimitives.custom.timeseries_preprocessing.time_segments_average.json diff --git a/mlprimitives/jsons/mlprimitives.custom.trivial.TrivialPredictor.json b/mlprimitives/primitives/mlprimitives.custom.trivial.TrivialPredictor.json similarity index 100% rename from mlprimitives/jsons/mlprimitives.custom.trivial.TrivialPredictor.json rename to mlprimitives/primitives/mlprimitives.custom.trivial.TrivialPredictor.json diff --git a/mlprimitives/jsons/networkx.graph_feature_extraction.json b/mlprimitives/primitives/networkx.graph_feature_extraction.json similarity index 100% rename from mlprimitives/jsons/networkx.graph_feature_extraction.json rename to mlprimitives/primitives/networkx.graph_feature_extraction.json diff --git a/mlprimitives/jsons/networkx.link_prediction_feature_extraction.json b/mlprimitives/primitives/networkx.link_prediction_feature_extraction.json similarity index 100% rename from mlprimitives/jsons/networkx.link_prediction_feature_extraction.json rename to mlprimitives/primitives/networkx.link_prediction_feature_extraction.json diff --git a/mlprimitives/jsons/numpy.argmax.json b/mlprimitives/primitives/numpy.argmax.json similarity index 96% rename from mlprimitives/jsons/numpy.argmax.json rename to mlprimitives/primitives/numpy.argmax.json index 96001ee9..112291f3 100644 --- a/mlprimitives/jsons/numpy.argmax.json +++ b/mlprimitives/primitives/numpy.argmax.json @@ -14,6 +14,7 @@ "args": [ { "name": "y", + "keyword": 
"a", "type": "ndarray" } ], diff --git a/mlprimitives/jsons/pandas.DataFrame.resample.json b/mlprimitives/primitives/pandas.DataFrame.resample.json similarity index 62% rename from mlprimitives/jsons/pandas.DataFrame.resample.json rename to mlprimitives/primitives/pandas.DataFrame.resample.json index 56a883a7..07d9e71a 100644 --- a/mlprimitives/jsons/pandas.DataFrame.resample.json +++ b/mlprimitives/primitives/pandas.DataFrame.resample.json @@ -6,8 +6,7 @@ "documentation": "https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.unstack.html", "description": "Call the `df.resample` method on the given time_index and afterwards call the indicated aggregation.", "classifiers": { - "type": "preprocessor", - "subtype": "feature_extractor" + "type": "helper" }, "modalities": [], "primitive": "mlprimitives.adapters.pandas.resample", @@ -28,22 +27,47 @@ }, "hyperparameters": { "fixed": { - "rule": { + "on": { "type": "str", - "description": "The offset string or object representing target conversion." + "default": null, + "description": "Name of the column to use as the time index. Optional." }, "time_index": { "type": "str", - "description": "Name of the column to use as the time index." + "description": "(Deprecated in favor of `on`). Name of the column to use as the time index." }, "groupby": { "type": "str", "default": null, "description": "Optional list of columns to group by." }, + "reset_index": { + "type": "bool", + "default": false + } + }, + "tunable": { + "rule": { + "type": "int", + "default": 3600, + "range": [ + 1, + 86400 + ], + "description": "The offset interval lenght, in seconds." + }, "aggregation": { "type": "str", "default": "mean", + "values": [ + "mean", + "median", + "prod", + "quantile", + "std", + "sum", + "var" + ], "description": "Name of the aggregation function to use." 
diff --git a/mlprimitives/jsons/pandas.DataFrame.unstack.json b/mlprimitives/primitives/pandas.DataFrame.unstack.json similarity index 94% rename from mlprimitives/jsons/pandas.DataFrame.unstack.json rename to mlprimitives/primitives/pandas.DataFrame.unstack.json index 4d7dab12..bd703294 100644 --- a/mlprimitives/jsons/pandas.DataFrame.unstack.json +++ b/mlprimitives/primitives/pandas.DataFrame.unstack.json @@ -6,8 +6,7 @@ "documentation": "https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.unstack.html", "description": "Call the `df.unstack` method using the indicated level and afterwards join the column names using an underscore.", "classifiers": { - "type": "preprocessor", - "subtype": "feature_extractor" + "type": "helper" }, "modalities": [], "primitive": "mlprimitives.adapters.pandas.unstack", diff --git a/mlprimitives/jsons/skimage.feature.hog.json b/mlprimitives/primitives/skimage.feature.hog.json similarity index 100% rename from mlprimitives/jsons/skimage.feature.hog.json rename to mlprimitives/primitives/skimage.feature.hog.json diff --git a/mlprimitives/jsons/sklearn.decomposition.DictionaryLearning.json b/mlprimitives/primitives/sklearn.decomposition.DictionaryLearning.json similarity index 100% rename from mlprimitives/jsons/sklearn.decomposition.DictionaryLearning.json rename to mlprimitives/primitives/sklearn.decomposition.DictionaryLearning.json diff --git a/mlprimitives/jsons/sklearn.decomposition.FactorAnalysis.json b/mlprimitives/primitives/sklearn.decomposition.FactorAnalysis.json similarity index 100% rename from mlprimitives/jsons/sklearn.decomposition.FactorAnalysis.json rename to mlprimitives/primitives/sklearn.decomposition.FactorAnalysis.json diff --git a/mlprimitives/jsons/sklearn.decomposition.FastICA.json b/mlprimitives/primitives/sklearn.decomposition.FastICA.json similarity index 100% rename from mlprimitives/jsons/sklearn.decomposition.FastICA.json rename to mlprimitives/primitives/sklearn.decomposition.FastICA.json diff --git a/mlprimitives/jsons/sklearn.decomposition.KernelPCA.json b/mlprimitives/primitives/sklearn.decomposition.KernelPCA.json similarity index 100% rename from mlprimitives/jsons/sklearn.decomposition.KernelPCA.json rename to mlprimitives/primitives/sklearn.decomposition.KernelPCA.json diff --git a/mlprimitives/jsons/sklearn.decomposition.PCA.json b/mlprimitives/primitives/sklearn.decomposition.PCA.json similarity index 100% rename from mlprimitives/jsons/sklearn.decomposition.PCA.json rename to mlprimitives/primitives/sklearn.decomposition.PCA.json diff --git a/mlprimitives/jsons/sklearn.decomposition.TruncatedSVD.json b/mlprimitives/primitives/sklearn.decomposition.TruncatedSVD.json similarity index 100% rename from mlprimitives/jsons/sklearn.decomposition.TruncatedSVD.json rename to mlprimitives/primitives/sklearn.decomposition.TruncatedSVD.json diff --git a/mlprimitives/jsons/sklearn.ensemble.AdaBoostClassifier.json b/mlprimitives/primitives/sklearn.ensemble.AdaBoostClassifier.json similarity index 100% rename from mlprimitives/jsons/sklearn.ensemble.AdaBoostClassifier.json rename to mlprimitives/primitives/sklearn.ensemble.AdaBoostClassifier.json diff --git a/mlprimitives/jsons/sklearn.ensemble.AdaBoostRegressor.json b/mlprimitives/primitives/sklearn.ensemble.AdaBoostRegressor.json similarity index 100% rename from mlprimitives/jsons/sklearn.ensemble.AdaBoostRegressor.json rename to mlprimitives/primitives/sklearn.ensemble.AdaBoostRegressor.json diff --git 
a/mlprimitives/jsons/sklearn.ensemble.BaggingClassifier.json b/mlprimitives/primitives/sklearn.ensemble.BaggingClassifier.json similarity index 100% rename from mlprimitives/jsons/sklearn.ensemble.BaggingClassifier.json rename to mlprimitives/primitives/sklearn.ensemble.BaggingClassifier.json diff --git a/mlprimitives/jsons/sklearn.ensemble.BaggingRegressor.json b/mlprimitives/primitives/sklearn.ensemble.BaggingRegressor.json similarity index 100% rename from mlprimitives/jsons/sklearn.ensemble.BaggingRegressor.json rename to mlprimitives/primitives/sklearn.ensemble.BaggingRegressor.json diff --git a/mlprimitives/jsons/sklearn.ensemble.ExtraTreesClassifier.json b/mlprimitives/primitives/sklearn.ensemble.ExtraTreesClassifier.json similarity index 100% rename from mlprimitives/jsons/sklearn.ensemble.ExtraTreesClassifier.json rename to mlprimitives/primitives/sklearn.ensemble.ExtraTreesClassifier.json diff --git a/mlprimitives/jsons/sklearn.ensemble.ExtraTreesRegressor.json b/mlprimitives/primitives/sklearn.ensemble.ExtraTreesRegressor.json similarity index 100% rename from mlprimitives/jsons/sklearn.ensemble.ExtraTreesRegressor.json rename to mlprimitives/primitives/sklearn.ensemble.ExtraTreesRegressor.json diff --git a/mlprimitives/jsons/sklearn.ensemble.GradientBoostingClassifier.json b/mlprimitives/primitives/sklearn.ensemble.GradientBoostingClassifier.json similarity index 100% rename from mlprimitives/jsons/sklearn.ensemble.GradientBoostingClassifier.json rename to mlprimitives/primitives/sklearn.ensemble.GradientBoostingClassifier.json diff --git a/mlprimitives/jsons/sklearn.ensemble.GradientBoostingRegressor.json b/mlprimitives/primitives/sklearn.ensemble.GradientBoostingRegressor.json similarity index 100% rename from mlprimitives/jsons/sklearn.ensemble.GradientBoostingRegressor.json rename to mlprimitives/primitives/sklearn.ensemble.GradientBoostingRegressor.json diff --git a/mlprimitives/jsons/sklearn.ensemble.IsolationForest.json b/mlprimitives/primitives/sklearn.ensemble.IsolationForest.json similarity index 100% rename from mlprimitives/jsons/sklearn.ensemble.IsolationForest.json rename to mlprimitives/primitives/sklearn.ensemble.IsolationForest.json diff --git a/mlprimitives/jsons/sklearn.ensemble.RandomForestClassifier.json b/mlprimitives/primitives/sklearn.ensemble.RandomForestClassifier.json similarity index 100% rename from mlprimitives/jsons/sklearn.ensemble.RandomForestClassifier.json rename to mlprimitives/primitives/sklearn.ensemble.RandomForestClassifier.json diff --git a/mlprimitives/jsons/sklearn.ensemble.RandomForestClassifier_proba.json b/mlprimitives/primitives/sklearn.ensemble.RandomForestClassifier_proba.json similarity index 98% rename from mlprimitives/jsons/sklearn.ensemble.RandomForestClassifier_proba.json rename to mlprimitives/primitives/sklearn.ensemble.RandomForestClassifier_proba.json index 8d3dd595..cab5d1f7 100644 --- a/mlprimitives/jsons/sklearn.ensemble.RandomForestClassifier_proba.json +++ b/mlprimitives/primitives/sklearn.ensemble.RandomForestClassifier_proba.json @@ -8,7 +8,7 @@ "description": "Scikit-learn RandomForestClassifier that uses predict_proba as the produce method.", "classifiers": { "type": "estimator", - "subtype": "classifier" + "subtype": "probability_classifier" }, "modalities": [], "primitive": "sklearn.ensemble.RandomForestClassifier", diff --git a/mlprimitives/jsons/sklearn.ensemble.RandomForestRegressor.json b/mlprimitives/primitives/sklearn.ensemble.RandomForestRegressor.json similarity index 100% rename from 
mlprimitives/jsons/sklearn.ensemble.RandomForestRegressor.json rename to mlprimitives/primitives/sklearn.ensemble.RandomForestRegressor.json diff --git a/mlprimitives/jsons/sklearn.ensemble.RandomTreesEmbedding.json b/mlprimitives/primitives/sklearn.ensemble.RandomTreesEmbedding.json similarity index 100% rename from mlprimitives/jsons/sklearn.ensemble.RandomTreesEmbedding.json rename to mlprimitives/primitives/sklearn.ensemble.RandomTreesEmbedding.json diff --git a/mlprimitives/jsons/sklearn.feature_extraction.text.CountVectorizer.json b/mlprimitives/primitives/sklearn.feature_extraction.text.CountVectorizer.json similarity index 100% rename from mlprimitives/jsons/sklearn.feature_extraction.text.CountVectorizer.json rename to mlprimitives/primitives/sklearn.feature_extraction.text.CountVectorizer.json diff --git a/mlprimitives/jsons/sklearn.feature_extraction.text.TfidfTransformer.json b/mlprimitives/primitives/sklearn.feature_extraction.text.TfidfTransformer.json similarity index 100% rename from mlprimitives/jsons/sklearn.feature_extraction.text.TfidfTransformer.json rename to mlprimitives/primitives/sklearn.feature_extraction.text.TfidfTransformer.json diff --git a/mlprimitives/jsons/sklearn.impute.SimpleImputer.json b/mlprimitives/primitives/sklearn.impute.SimpleImputer.json similarity index 100% rename from mlprimitives/jsons/sklearn.impute.SimpleImputer.json rename to mlprimitives/primitives/sklearn.impute.SimpleImputer.json diff --git a/mlprimitives/jsons/sklearn.linear_model.ElasticNet.json b/mlprimitives/primitives/sklearn.linear_model.ElasticNet.json similarity index 100% rename from mlprimitives/jsons/sklearn.linear_model.ElasticNet.json rename to mlprimitives/primitives/sklearn.linear_model.ElasticNet.json diff --git a/mlprimitives/jsons/sklearn.linear_model.Lars.json b/mlprimitives/primitives/sklearn.linear_model.Lars.json similarity index 100% rename from mlprimitives/jsons/sklearn.linear_model.Lars.json rename to mlprimitives/primitives/sklearn.linear_model.Lars.json diff --git a/mlprimitives/jsons/sklearn.linear_model.Lasso.json b/mlprimitives/primitives/sklearn.linear_model.Lasso.json similarity index 100% rename from mlprimitives/jsons/sklearn.linear_model.Lasso.json rename to mlprimitives/primitives/sklearn.linear_model.Lasso.json diff --git a/mlprimitives/jsons/sklearn.linear_model.LinearRegression.json b/mlprimitives/primitives/sklearn.linear_model.LinearRegression.json similarity index 100% rename from mlprimitives/jsons/sklearn.linear_model.LinearRegression.json rename to mlprimitives/primitives/sklearn.linear_model.LinearRegression.json diff --git a/mlprimitives/jsons/sklearn.linear_model.LogisticRegression.json b/mlprimitives/primitives/sklearn.linear_model.LogisticRegression.json similarity index 100% rename from mlprimitives/jsons/sklearn.linear_model.LogisticRegression.json rename to mlprimitives/primitives/sklearn.linear_model.LogisticRegression.json diff --git a/mlprimitives/jsons/sklearn.linear_model.MultiTaskLasso.json b/mlprimitives/primitives/sklearn.linear_model.MultiTaskLasso.json similarity index 100% rename from mlprimitives/jsons/sklearn.linear_model.MultiTaskLasso.json rename to mlprimitives/primitives/sklearn.linear_model.MultiTaskLasso.json diff --git a/mlprimitives/jsons/sklearn.linear_model.Ridge.json b/mlprimitives/primitives/sklearn.linear_model.Ridge.json similarity index 100% rename from mlprimitives/jsons/sklearn.linear_model.Ridge.json rename to mlprimitives/primitives/sklearn.linear_model.Ridge.json diff --git 
a/mlprimitives/jsons/sklearn.naive_bayes.MultinomialNB.json b/mlprimitives/primitives/sklearn.naive_bayes.MultinomialNB.json similarity index 97% rename from mlprimitives/jsons/sklearn.naive_bayes.MultinomialNB.json rename to mlprimitives/primitives/sklearn.naive_bayes.MultinomialNB.json index 385d1154..03769a4c 100644 --- a/mlprimitives/jsons/sklearn.naive_bayes.MultinomialNB.json +++ b/mlprimitives/primitives/sklearn.naive_bayes.MultinomialNB.json @@ -9,9 +9,7 @@ "type": "estimator", "subtype": "classifier" }, - "modalities": [ - "text" - ], + "modalities": [], "primitive": "sklearn.naive_bayes.MultinomialNB", "fit": { "method": "fit", diff --git a/mlprimitives/jsons/sklearn.neighbors.KNeighborsClassifier.json b/mlprimitives/primitives/sklearn.neighbors.KNeighborsClassifier.json similarity index 100% rename from mlprimitives/jsons/sklearn.neighbors.KNeighborsClassifier.json rename to mlprimitives/primitives/sklearn.neighbors.KNeighborsClassifier.json diff --git a/mlprimitives/jsons/sklearn.neighbors.KNeighborsClassifier_proba.json b/mlprimitives/primitives/sklearn.neighbors.KNeighborsClassifier_proba.json similarity index 98% rename from mlprimitives/jsons/sklearn.neighbors.KNeighborsClassifier_proba.json rename to mlprimitives/primitives/sklearn.neighbors.KNeighborsClassifier_proba.json index 46c77ee9..454cecd6 100644 --- a/mlprimitives/jsons/sklearn.neighbors.KNeighborsClassifier_proba.json +++ b/mlprimitives/primitives/sklearn.neighbors.KNeighborsClassifier_proba.json @@ -7,7 +7,7 @@ "description": "Scikit-learn KNeighborsClassifier that uses predict_proba as the produce method.", "classifiers": { "type": "estimator", - "subtype": "classifier" + "subtype": "probability_classifier" }, "modalities": [], "primitive": "sklearn.neighbors.KNeighborsClassifier", diff --git a/mlprimitives/jsons/sklearn.neighbors.KNeighborsRegressor.json b/mlprimitives/primitives/sklearn.neighbors.KNeighborsRegressor.json similarity index 100% rename from mlprimitives/jsons/sklearn.neighbors.KNeighborsRegressor.json rename to mlprimitives/primitives/sklearn.neighbors.KNeighborsRegressor.json diff --git a/mlprimitives/jsons/sklearn.preprocessing.Imputer.json b/mlprimitives/primitives/sklearn.preprocessing.Imputer.json similarity index 100% rename from mlprimitives/jsons/sklearn.preprocessing.Imputer.json rename to mlprimitives/primitives/sklearn.preprocessing.Imputer.json diff --git a/mlprimitives/jsons/sklearn.preprocessing.MaxAbsScaler.json b/mlprimitives/primitives/sklearn.preprocessing.MaxAbsScaler.json similarity index 100% rename from mlprimitives/jsons/sklearn.preprocessing.MaxAbsScaler.json rename to mlprimitives/primitives/sklearn.preprocessing.MaxAbsScaler.json diff --git a/mlprimitives/jsons/sklearn.preprocessing.MinMaxScaler.json b/mlprimitives/primitives/sklearn.preprocessing.MinMaxScaler.json similarity index 100% rename from mlprimitives/jsons/sklearn.preprocessing.MinMaxScaler.json rename to mlprimitives/primitives/sklearn.preprocessing.MinMaxScaler.json diff --git a/mlprimitives/jsons/sklearn.preprocessing.OneHotEncoder.json b/mlprimitives/primitives/sklearn.preprocessing.OneHotEncoder.json similarity index 100% rename from mlprimitives/jsons/sklearn.preprocessing.OneHotEncoder.json rename to mlprimitives/primitives/sklearn.preprocessing.OneHotEncoder.json diff --git a/mlprimitives/jsons/sklearn.preprocessing.RobustScaler.json b/mlprimitives/primitives/sklearn.preprocessing.RobustScaler.json similarity index 100% rename from mlprimitives/jsons/sklearn.preprocessing.RobustScaler.json rename to 
mlprimitives/primitives/sklearn.preprocessing.RobustScaler.json diff --git a/mlprimitives/jsons/sklearn.preprocessing.StandardScaler.json b/mlprimitives/primitives/sklearn.preprocessing.StandardScaler.json similarity index 100% rename from mlprimitives/jsons/sklearn.preprocessing.StandardScaler.json rename to mlprimitives/primitives/sklearn.preprocessing.StandardScaler.json diff --git a/mlprimitives/jsons/statsmodels.arima_model.Arima.json b/mlprimitives/primitives/statsmodels.arima_model.Arima.json similarity index 100% rename from mlprimitives/jsons/statsmodels.arima_model.Arima.json rename to mlprimitives/primitives/statsmodels.arima_model.Arima.json diff --git a/mlprimitives/jsons/xgboost.XGBClassifier.json b/mlprimitives/primitives/xgboost.XGBClassifier.json similarity index 100% rename from mlprimitives/jsons/xgboost.XGBClassifier.json rename to mlprimitives/primitives/xgboost.XGBClassifier.json diff --git a/mlprimitives/jsons/xgboost.XGBRegressor.json b/mlprimitives/primitives/xgboost.XGBRegressor.json similarity index 100% rename from mlprimitives/jsons/xgboost.XGBRegressor.json rename to mlprimitives/primitives/xgboost.XGBRegressor.json diff --git a/setup.cfg b/setup.cfg index a91ba81b..632e0962 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.10-dev +current_version = 0.2.1-dev commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+))? diff --git a/setup.py b/setup.py index 519df60d..84f17625 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,7 @@ install_requires = [ + 'mlblocks>=0.3.0,<0.4', 'Keras>=2.1.6,<3', 'featuretools>=0.6.1,<0.7', 'iso639>=0.1.4,<0.2', @@ -59,8 +60,7 @@ 'm2r>=0.2.0', 'Sphinx>=1.7.1', 'sphinx_rtd_theme>=0.2.4', - 'recommonmark>=0.4.0', - 'ipython==6.5.0', + 'ipython>=6.5.0', # style check 'flake8>=3.7.7', @@ -77,6 +77,9 @@ # Advanced testing 'coverage>=4.5.1', 'tox>=2.9.1', + + # Jupyter + 'jupyter>=1.0.0' ] @@ -87,7 +90,7 @@ setup( - author="MIT Data To AI Lab", + author='MIT Data To AI Lab', author_email='dailabmit@gmail.com', classifiers=[ 'Development Status :: 2 - Pre-Alpha', @@ -98,22 +101,26 @@ 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', ], - description="MLBlocks Primitives", + description='Pipelines and primitives for machine learning and data science.', entry_points = { 'console_scripts': [ 'mlprimitives=mlprimitives.cli:main' ], + 'mlblocks': [ + 'primitives=mlprimitives:MLBLOCKS_PRIMITIVES', + 'pipelines=mlprimitives:MLBLOCKS_PIPELINES' + ], 'mlprimitives': [ - 'jsons_path=mlprimitives:MLPRIMITIVES_JSONS_PATH' + 'jsons_path=mlprimitives:MLBLOCKS_PRIMITIVES', ] }, extras_require=extras_require, install_requires=install_requires, - license="MIT license", + license='MIT license', long_description=readme + '\n\n' + history, long_description_content_type='text/markdown', include_package_data=True, - keywords='mlblocks mlprimitives mlblocks_primitives', + keywords='mlblocks mlprimitives pipelines primitives machine learning data science', name='mlprimitives', packages=find_packages(include=['mlprimitives', 'mlprimitives.*']), python_requires='>=3.5', @@ -121,6 +128,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/MLPrimitives', - version='0.1.10-dev', + version='0.2.1-dev', zip_safe=False, ) diff --git a/tests/adapters/test_pandas.py b/tests/adapters/test_pandas.py new file mode 100644 index 00000000..f190cf0a --- /dev/null +++ b/tests/adapters/test_pandas.py @@ -0,0 +1,124 @@ +from datetime import datetime +from 
unittest import TestCase + +import numpy as np +import pandas as pd +from pandas.util.testing import assert_frame_equal + +from mlprimitives.adapters.pandas import resample + + +class ResampleTest(TestCase): + + def setUp(self): + self.df = pd.DataFrame({ + 'dt': [ + datetime(2000, 1, day + 1, hour, 0) + for day in range(4) + for hour in range(24) + ], + 'value': list(range(4 * 24)) + }) + + def test_resample_rule_str(self): + + out = resample(self.df.set_index('dt'), '1d') + + assert_frame_equal(out, pd.DataFrame([ + {'dt': datetime(2000, 1, 1), 'value': 11.5}, + {'dt': datetime(2000, 1, 2), 'value': 35.5}, + {'dt': datetime(2000, 1, 3), 'value': 59.5}, + {'dt': datetime(2000, 1, 4), 'value': 83.5}, + ])) + + def test_resample_rule_int(self): + + out = resample(self.df.set_index('dt'), 86400) + + assert_frame_equal(out, pd.DataFrame([ + {'dt': datetime(2000, 1, 1), 'value': 11.5}, + {'dt': datetime(2000, 1, 2), 'value': 35.5}, + {'dt': datetime(2000, 1, 3), 'value': 59.5}, + {'dt': datetime(2000, 1, 4), 'value': 83.5}, + ])) + + def test_resample_groupby(self): + + self.df['group1'] = ['A', 'B'] * 2 * 24 + self.df['group2'] = ['C', 'C', 'D', 'D'] * 24 + + out = resample(self.df.set_index('dt'), '1d', groupby=['group1', 'group2']) + + assert_frame_equal(out, pd.DataFrame([ + {'group1': 'A', 'group2': 'C', 'dt': datetime(2000, 1, 1), 'value': 10}, + {'group1': 'A', 'group2': 'C', 'dt': datetime(2000, 1, 2), 'value': 34}, + {'group1': 'A', 'group2': 'C', 'dt': datetime(2000, 1, 3), 'value': 58}, + {'group1': 'A', 'group2': 'C', 'dt': datetime(2000, 1, 4), 'value': 82}, + {'group1': 'A', 'group2': 'D', 'dt': datetime(2000, 1, 1), 'value': 12}, + {'group1': 'A', 'group2': 'D', 'dt': datetime(2000, 1, 2), 'value': 36}, + {'group1': 'A', 'group2': 'D', 'dt': datetime(2000, 1, 3), 'value': 60}, + {'group1': 'A', 'group2': 'D', 'dt': datetime(2000, 1, 4), 'value': 84}, + {'group1': 'B', 'group2': 'C', 'dt': datetime(2000, 1, 1), 'value': 11}, + {'group1': 'B', 'group2': 'C', 'dt': datetime(2000, 1, 2), 'value': 35}, + {'group1': 'B', 'group2': 'C', 'dt': datetime(2000, 1, 3), 'value': 59}, + {'group1': 'B', 'group2': 'C', 'dt': datetime(2000, 1, 4), 'value': 83}, + {'group1': 'B', 'group2': 'D', 'dt': datetime(2000, 1, 1), 'value': 13}, + {'group1': 'B', 'group2': 'D', 'dt': datetime(2000, 1, 2), 'value': 37}, + {'group1': 'B', 'group2': 'D', 'dt': datetime(2000, 1, 3), 'value': 61}, + {'group1': 'B', 'group2': 'D', 'dt': datetime(2000, 1, 4), 'value': 85}, + ], columns=['group1', 'group2', 'dt', 'value'])) + + def test_resample_on(self): + + out = resample(self.df, '1d', on='dt') + + assert_frame_equal(out, pd.DataFrame([ + {'dt': datetime(2000, 1, 1), 'value': 11.5}, + {'dt': datetime(2000, 1, 2), 'value': 35.5}, + {'dt': datetime(2000, 1, 3), 'value': 59.5}, + {'dt': datetime(2000, 1, 4), 'value': 83.5}, + ])) + + def test_resample_reset_index_false(self): + + out = resample(self.df.set_index('dt'), '1d', reset_index=False) + + assert_frame_equal(out, pd.DataFrame([ + {'dt': datetime(2000, 1, 1), 'value': 11.5}, + {'dt': datetime(2000, 1, 2), 'value': 35.5}, + {'dt': datetime(2000, 1, 3), 'value': 59.5}, + {'dt': datetime(2000, 1, 4), 'value': 83.5}, + ]).set_index('dt')) + + def test_resample_aggregation_str(self): + + out = resample(self.df.set_index('dt'), '1d', aggregation='max') + + assert_frame_equal(out, pd.DataFrame([ + {'dt': datetime(2000, 1, 1), 'value': 23}, + {'dt': datetime(2000, 1, 2), 'value': 47}, + {'dt': datetime(2000, 1, 3), 'value': 71}, + {'dt': datetime(2000, 1, 4), 
'value': 95}, + ])) + + def test_resample_aggregation_func(self): + + out = resample(self.df.set_index('dt'), '1d', aggregation=np.max) + + assert_frame_equal(out, pd.DataFrame([ + {'dt': datetime(2000, 1, 1), 'value': 23}, + {'dt': datetime(2000, 1, 2), 'value': 47}, + {'dt': datetime(2000, 1, 3), 'value': 71}, + {'dt': datetime(2000, 1, 4), 'value': 95}, + ])) + + def test_resample_aggregation_import(self): + + out = resample(self.df.set_index('dt'), '1d', aggregation='numpy.max') + + assert_frame_equal(out, pd.DataFrame([ + {'dt': datetime(2000, 1, 1), 'value': 23}, + {'dt': datetime(2000, 1, 2), 'value': 47}, + {'dt': datetime(2000, 1, 3), 'value': 71}, + {'dt': datetime(2000, 1, 4), 'value': 95}, + ])) diff --git a/tests/custom/test_feature_extraction.py b/tests/custom/test_feature_extraction.py index ef56ebb8..f3ebcbca 100644 --- a/tests/custom/test_feature_extraction.py +++ b/tests/custom/test_feature_extraction.py @@ -8,17 +8,6 @@ class FeatureExtractorTest(TestCase): - def test_detect_features(self): - X = pd.DataFrame({ - 'a': ['a', 'b', 'c'], - 'b': ['d', 'e', 'f'], - 'c': [1, 2, 3] - }) - - features = FeatureExtractor.detect_features(X) - - assert features == ['a', 'b'] - @classmethod def assert_equal(cls, obj1, obj2): if hasattr(obj1, 'equals'): @@ -33,7 +22,7 @@ def assert_equal(cls, obj1, obj2): def test_fit_features(self): class FE(FeatureExtractor): - detect_features = Mock() + _detect_features = Mock() _fit = Mock() fe = FE(features=['b']) @@ -46,11 +35,11 @@ class FE(FeatureExtractor): fe.fit(X) assert fe._features == ['b'] - assert fe.detect_features.not_called() + fe._detect_features.assert_not_called() def test_fit_auto_pandas(self): class FE(FeatureExtractor): - detect_features = Mock(return_value=['a', 'b']) + _detect_features = Mock(return_value=['a', 'b']) _fit = Mock() fe = FE(features='auto') @@ -63,7 +52,7 @@ class FE(FeatureExtractor): fe.fit(X) assert fe._features == ['a', 'b'] - assert fe.detect_features.called_once_with(X) + fe._detect_features.assert_called_once_with(X) expected_calls = [ ((pd.Series(['a', 'b', 'c']), ), {}), ((pd.Series(['d', 'e', 'f']), ), {}) @@ -72,7 +61,7 @@ class FE(FeatureExtractor): def test_fit_auto_numpy(self): class FE(FeatureExtractor): - detect_features = Mock(return_value=[0, 1]) + _detect_features = Mock(return_value=[0, 1]) _fit = Mock() fe = FE(features='auto') @@ -85,7 +74,7 @@ class FE(FeatureExtractor): fe.fit(X) assert fe._features == [0, 1] - assert fe.detect_features.called_once_with(X) + fe._detect_features.assert_called_once_with(X) expected_calls = [ ((pd.Series(['a', 'b', 'c']), ), {}), ((pd.Series(['d', 'e', 'f']), ), {}) diff --git a/tests/custom/test_timeseries_anomalies.py b/tests/custom/test_timeseries_anomalies.py index ce5fc0e3..ace9da7d 100644 --- a/tests/custom/test_timeseries_anomalies.py +++ b/tests/custom/test_timeseries_anomalies.py @@ -6,7 +6,7 @@ from pandas.testing import assert_frame_equal from mlprimitives.custom.timeseries_anomalies import ( - _find_sequences, _get_max_errors, _prune_anomalies, find_anomalies) + _find_sequences, _get_max_errors, _merge_sequences, _prune_anomalies, find_anomalies) class GetMaxErrorsTest(TestCase): @@ -64,7 +64,7 @@ def test_no_sequences(self): max_errors = pd.DataFrame([ [0.1, -1, -1] ], columns=['max_error', 'start', 'stop']) - expected = np.ndarray((0, 2)) + expected = np.ndarray((0, 3)) self._run(max_errors, expected) def test_no_anomalies(self): max_errors = pd.DataFrame([ [0.11, 1, 2], [0.1, -1, -1] ], columns=['max_error', 'start', 
'stop']) - expected = np.ndarray((0, 2)) + expected = np.ndarray((0, 3)) self._run(max_errors, expected) def test_one_anomaly(self): @@ -81,7 +81,7 @@ def test_one_anomaly(self): [0.1, -1, -1] ], columns=['max_error', 'start', 'stop']) expected = np.array([ - [1, 2] + [1, 2, 0.2] ]) self._run(max_errors, expected) @@ -92,8 +92,8 @@ def test_two_anomalies(self): [0.1, -1, -1] ], columns=['max_error', 'start', 'stop']) expected = np.array([ - [1, 2], - [4, 5] + [1, 2, 0.3], + [4, 5, 0.2] ]) self._run(max_errors, expected) @@ -105,8 +105,8 @@ def test_two_out_of_three(self): [0.1, -1, -1] ], columns=['max_error', 'start', 'stop']) expected = np.array([ - [1, 2], - [4, 5] + [1, 2, 0.3], + [4, 5, 0.22] ]) self._run(max_errors, expected) @@ -118,9 +118,9 @@ def test_two_with_a_gap(self): [0.1, -1, -1] ], columns=['max_error', 'start', 'stop']) expected = np.array([ - [1, 2], - [4, 5], - [7, 8] + [1, 2, 0.3], + [4, 5, 0.21], + [7, 8, 0.2] ]) self._run(max_errors, expected) @@ -128,9 +128,11 @@ def test_two_with_a_gap(self): class FindSequencesTest(TestCase): THRESHOLD = 0.5 + ANOMALY_PADDING = 1 def _run(self, errors, expected, expected_max): - found, max_below = _find_sequences(np.asarray(errors), self.THRESHOLD) + found, max_below = _find_sequences(np.asarray(errors), self.THRESHOLD, + self.ANOMALY_PADDING) np.testing.assert_array_equal(found, expected) assert max_below == expected_max @@ -142,28 +144,57 @@ def test__find_sequences_all_one_sequence(self): self._run([1, 1, 1, 1], [(0, 3)], 0) def test__find_sequences_open_end(self): - self._run([0, 1, 1, 1], [(1, 3)], 0) + self._run([0, 0, 0.4, 1, 1, 1], [(2, 5)], 0) def test__find_sequences_open_start(self): - self._run([1, 1, 1, 0], [(0, 2)], 0) + self._run([1, 1, 1, 0.4, 0, 0], [(0, 3)], 0) def test__find_sequences_middle(self): - self._run([0, 1, 1, 0], [(1, 2)], 0) + self._run([0, 0, 1, 1, 0, 0], [(1, 4)], 0) + + def test__find_sequences_stop(self): + self._run([1, 0, 0, 0, 1, 1], [(0, 1), (3, 5)], 0) + + +class MergeSequencesTest(TestCase): + + def _run(self, sequences, expected): + merged_sequences = _merge_sequences(sequences) + + np.testing.assert_array_equal(merged_sequences, expected) + + def test__merge_sequences_consecutive(self): + self._run([(1, 2, 0.5), (3, 4, 0.5)], [(1, 4, 0.5)]) - def test__find_sequences_stop_length_one(self): - self._run([1, 0, 1, 1], [(0, 0), (2, 3)], 0) + def test__merge_sequences_start_overlap(self): + self._run([(1, 3, 0.5), (2, 4, 0.5)], [(1, 4, 0.5)]) - def test__find_sequences_open_length_one(self): - self._run([1, 0, 0, 1], [(0, 0), (3, 3)], 0) + def test__merge_sequences_start_end_overlap(self): + self._run([(1, 4, 0.5), (2, 3, 0.5)], [(1, 4, 0.5)]) + + def test__merge_sequences_non_consecutive(self): + self._run([(1, 2, 0.5), (4, 5, 0.5)], [(1, 2, 0.5), (4, 5, 0.5)]) + + def test__merge_sequences_consecutive_different_score(self): + self._run([(1, 2, 1.0), (3, 4, 0.5)], [(1, 4, 0.75)]) + + def test__merge_sequences_consecutive_different_score_and_length(self): + self._run([(1, 2, 1.0), (3, 4, 0.5)], [(1, 4, 0.75)]) class FindAnomaliesTest(TestCase): THRESHOLD = 0.5 - INDEX = [10, 11, 12, 13] + INDEX_SHORT = [1, 2, 3, 4] + INDEX_LONG = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + ANOMALY_PADDING = 1 - def _run(self, errors, expected): - found = find_anomalies(np.asarray(errors), self.INDEX) + def _run(self, errors, expected, index=INDEX_SHORT, window_size=None, + window_step_size=None, lower_threshold=False): + found = find_anomalies(np.asarray(errors), index=index, + anomaly_padding=self.ANOMALY_PADDING, + 
window_size=window_size, window_step_size=window_step_size, + lower_threshold=lower_threshold) assert_allclose(found, expected) @@ -171,14 +202,26 @@ def test_find_anomalies_no_anomalies(self): self._run([0, 0, 0, 0], np.array([])) def test_find_anomalies_one_anomaly(self): - self._run([0, 0.5, 0.5, 0], np.array([[11., 12., 0.025]])) + self._run([0, 0.5, 0.5, 0], np.array([[1., 4., 0.025]])) def test_find_anomalies_open_start(self): - self._run([0.5, 0.5, 0, 0], np.array([[10., 11., 0.025]])) + self._run([0.5, 0.5, 0, 0], np.array([[1., 3., 0.025]])) def test_find_anomalies_open_end(self): - self._run([0, 0, 0.5, 0.5], np.array([[12., 13., 0.025]])) + self._run([0, 0, 0.5, 0.5], np.array([[2., 4., 0.025]])) def test_find_anomalies_two_anomalies(self): - self._run([0.5, 0, 0.5, 0], np.array([[10., 10., 0.025], [12., 12., 0.025]])) - self._run([0., 0.5, 0., 0.5], np.array([[11., 11., 0.025], [13., 13., 0.025]])) + self._run([0.5, 0, 0.5, 0], np.array([[1., 4., 0.025]])) + self._run([0, 0.5, 0, 0.5], np.array([[1., 4., 0.025]])) + + def test_find_anomalies_multiple_non_overlapping_thresholds(self): + self._run([0, 0, 0.5, 0.5, 0, 0, 0.5, 0.5, 0, 0], + np.array([[2., 4., 0.025], [6., 8., 0.025]]), index=self.INDEX_LONG, + window_size=4, window_step_size=4) + + def test_find_anomalies_multiple_overlapping_thresholds(self): + self._run([0, 0, 0.5, 0.5, 0, 0, 0.5, 0.5, 0, 0], np.array([[2., 9., 0.025]]), + index=self.INDEX_LONG, window_size=4, window_step_size=2) + + def test_find_anomalies_lower_threshold(self): + self._run([0.5, 0.5, 0, 0], np.array([[1., 4., 0.025]]), lower_threshold=True) diff --git a/tests/test_jsons.py b/tests/test_primitives.py similarity index 78% rename from tests/test_jsons.py rename to tests/test_primitives.py index dbd2bb91..53a8411c 100644 --- a/tests/test_jsons.py +++ b/tests/test_primitives.py @@ -5,7 +5,7 @@ from mlblocks import MLPipeline -from mlprimitives import MLPRIMITIVES_JSONS_PATH +from mlprimitives import MLBLOCKS_PRIMITIVES HYPERPARAMETER_DEFAULTS = { 'int': 1, @@ -19,10 +19,10 @@ def test_jsons(): """Validate MLBlocks primitive jsons""" - primitives = (f for f in os.listdir(MLPRIMITIVES_JSONS_PATH) if f.endswith('.json')) + primitives = (f for f in os.listdir(MLBLOCKS_PRIMITIVES) if f.endswith('.json')) for primitive_filename in primitives: try: - primitive_path = os.path.join(MLPRIMITIVES_JSONS_PATH, primitive_filename) + primitive_path = os.path.join(MLBLOCKS_PRIMITIVES, primitive_filename) with open(primitive_path, 'r') as f: primitive = json.load(f) @@ -36,7 +36,10 @@ def test_jsons(): init_hyperparameters[name] = HYPERPARAMETER_DEFAULTS.get(type_) block_name = primitive_name + '#1' - mlpipeline = MLPipeline([primitive_name], {block_name: init_hyperparameters}) + mlpipeline = MLPipeline( + primitives=[primitive_name], + init_params={block_name: init_hyperparameters} + ) # Validate methods mlblock = mlpipeline.blocks[block_name]
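Note: taken together, the `find_anomalies` changes above add windowed thresholds, padding, and an optional lower threshold. A minimal sketch of the retuned interface, mirroring test_find_anomalies_multiple_non_overlapping_thresholds above, so only the illustrative error values are assumed:

import numpy as np

from mlprimitives.custom.timeseries_anomalies import find_anomalies

# Two bursts of high regression errors among otherwise quiet values.
errors = np.array([0, 0, 0.5, 0.5, 0, 0, 0.5, 0.5, 0, 0])
index = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

anomalies = find_anomalies(
    errors,
    index=index,
    anomaly_padding=1,      # pad each anomalous sequence by one position
    window_size=4,          # compute one threshold per 4-point window
    window_step_size=4,     # equal step and size -> non-overlapping windows
    lower_threshold=False,  # only flag unusually high errors
)

# Each row is [start, stop, score]; non-overlapping windows keep the two
# bursts separate, as asserted in the test above:
print(anomalies)  # [[2., 4., 0.025], [6., 8., 0.025]]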